Add AhoCorasick
(#4465)
* Added code to find Articulation Points and Bridges * tried to solve clang-format test * removed new line at EOF to get lint to pass * feature: Added AhoCorasick Algorithm * fixed lint using clang-format * removed datastructures/graphs/ArticulationPointsAndBridge.java from this branch * removed main, since test-file is added. Also modified and renamed a few functions. * Added test-file for AhoCorasick Algorithm * Modified some comments in test-file * Modified some comments in AhoCorasick.java * lint fix * added a few more test cases * Modified some comments * Change all class fields to private, added initializeSuffixLinksForChildNodesOfTheRoot() method, hashmap string search position (also has previous index based search), removed java.util.* * Added Missing Test-Cases and more * minor text changes * added direct test check i.e. defining a variable expected and just checking if res and expected are equal. * Created New Class Trie, merged 'buildTrie and buildSuffixAndOutputLinks' with 'Trie constructor'. Merged setUpStartPoints with searchIn. Now AhoCorasick contains -> inner class: Trie, Node. Methods: search and convert. Trie has -> Methods : constructor and searchIn * Updated TestFile according to the updated AhoCorasick Class. 
Added a few more test cases * updated - broken down constructor to relevant parts, made string final, made res local to searchIn(), doxygen-like style * lint fix clang * Updated Tests Files * Added final field to Node class setters and Trie Constructor arguments, removed getTrieRoot() and some unnecessary comments, renamed [old -> new]: res -> positionByStringIndexValue, removed if condition from setUpStartPoints() * updated test file * lint fix clang * minor change - 'removed a comment' * added final fields to some arguments, class and variables, added a method initializePositionByStringIndexValue() * updated to remove * inclusion and added the required modules only * Implemented a new class PatternPositionRecorder to wrap up the position recording in searchIn() * Added final fields to PatternPositionRecorder Class * style: mark default constructor of `AhoCorasick` as `private` * style: remove redundant `public` --------- Co-authored-by: Piotr Idzik <65706193+vil02@users.noreply.github.com>
This commit is contained in:
parent
06aa834fa6
commit
8200a791a2
249
src/main/java/com/thealgorithms/strings/AhoCorasick.java
Normal file
249
src/main/java/com/thealgorithms/strings/AhoCorasick.java
Normal file
@ -0,0 +1,249 @@
|
||||
/*
|
||||
* Aho-Corasick String Matching Algorithm Implementation
|
||||
*
|
||||
* This code implements the Aho-Corasick algorithm, which is used for efficient
|
||||
* string matching in a given text. It can find multiple patterns simultaneously
|
||||
* and records their positions in the text.
|
||||
*
|
||||
* Author: Prabhat-Kumar-42
|
||||
* GitHub: https://github.com/Prabhat-Kumar-42
|
||||
*/
|
||||
|
||||
package com.thealgorithms.strings;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
|
||||
/**
 * Aho-Corasick multi-pattern string matching.
 *
 * <p>A trie is built from all patterns and augmented with two kinds of links:
 * <ul>
 *   <li><b>suffix links</b> point to the node spelling the longest proper suffix of the
 *       current node's string that is also a prefix of some pattern;</li>
 *   <li><b>output links</b> point to the nearest node on the suffix-link chain at which at
 *       least one pattern ends.</li>
 * </ul>
 * The text is then scanned once and every occurrence of every pattern is reported.
 *
 * <p>Matching is case-sensitive and operates on {@code char} units. Empty patterns are
 * ignored and therefore always report no occurrences.
 */
public final class AhoCorasick {
    private AhoCorasick() {
    }

    /** A trie state; each node corresponds to one prefix of at least one pattern. */
    private static class Node {
        private final HashMap<Character, Node> child = new HashMap<>(); // Outgoing edges, keyed by character
        private Node suffixLink; // Longest proper suffix that is also a trie node
        private Node outputLink; // Nearest pattern-ending node on the suffix-link chain, or null
        // Indices of ALL patterns ending here; more than one entry means the pattern array had duplicates
        private final ArrayList<Integer> patternIndices = new ArrayList<>();

        Node() {
            this.suffixLink = null;
            this.outputLink = null;
        }

        public HashMap<Character, Node> getChild() {
            return child;
        }

        public Node getSuffixLink() {
            return suffixLink;
        }

        public void setSuffixLink(final Node suffixLink) {
            this.suffixLink = suffixLink;
        }

        public Node getOutputLink() {
            return outputLink;
        }

        public void setOutputLink(final Node outputLink) {
            this.outputLink = outputLink;
        }

        public ArrayList<Integer> getPatternIndices() {
            return patternIndices;
        }

        public void addPatternInd(final int patternInd) {
            patternIndices.add(patternInd);
        }
    }

    /** Aho-Corasick automaton built once from a fixed set of patterns; reusable for many texts. */
    public static class Trie {

        private final Node root = new Node(); // Root node of the trie (the empty prefix)
        private final String[] patterns; // Private snapshot of the patterns the trie was built from

        /**
         * Builds the automaton for the given patterns.
         *
         * @param patterns patterns to search for; duplicates are allowed, empty strings are ignored
         * @throws NullPointerException if {@code patterns} or any of its elements is {@code null}
         */
        public Trie(final String[] patterns) {
            // Defensive copy: start offsets are derived from pattern lengths at search time,
            // so later mutation of the caller's array must not affect this trie.
            this.patterns = patterns.clone();
            buildTrie();
            buildSuffixAndOutputLinks();
        }

        // Inserts every non-empty pattern into the trie and tags its end node with the pattern index.
        private void buildTrie() {
            for (int i = 0; i < patterns.length; i++) {
                final String pattern = patterns[i];
                if (pattern.isEmpty()) {
                    // An empty pattern would end at the root and make every state report a bogus match.
                    continue;
                }

                Node curr = root;
                for (int j = 0; j < pattern.length(); j++) {
                    curr = curr.getChild().computeIfAbsent(pattern.charAt(j), key -> new Node());
                }
                curr.addPatternInd(i); // Duplicated patterns accumulate on the same node
            }
        }

        // Seeds the BFS: every child of the root falls back to the root.
        private void initializeSuffixLinksForChildNodesOfTheRoot(Queue<Node> q) {
            for (final Node childNode : root.getChild().values()) {
                childNode.setSuffixLink(root);
                q.add(childNode);
            }
        }

        // Computes suffix and output links in breadth-first order, so that the links of every
        // shallower node are final before a deeper node needs them.
        private void buildSuffixAndOutputLinks() {
            root.setSuffixLink(root); // The root falls back to itself
            final Queue<Node> q = new LinkedList<>();

            initializeSuffixLinksForChildNodesOfTheRoot(q);

            while (!q.isEmpty()) {
                final Node currentState = q.poll();

                for (final Map.Entry<Character, Node> edge : currentState.getChild().entrySet()) {
                    final char cc = edge.getKey();
                    final Node currentChild = edge.getValue();

                    // Walk the parent's suffix chain until a state with an edge on cc (or the root) is found
                    Node fallback = currentState.getSuffixLink();
                    while (fallback != root && !fallback.getChild().containsKey(cc)) {
                        fallback = fallback.getSuffixLink();
                    }

                    final Node target = fallback.getChild().get(cc);
                    currentChild.setSuffixLink(target != null ? target : root);

                    q.add(currentChild);
                }

                // The suffix node is shallower, hence already processed: reuse its output link
                // unless the suffix node itself ends a pattern.
                final Node suffix = currentState.getSuffixLink();
                currentState.setOutputLink(suffix.getPatternIndices().isEmpty() ? suffix.getOutputLink() : suffix);
            }
        }

        // Creates one empty position list per pattern index.
        private ArrayList<ArrayList<Integer>> initializePositionByStringIndexValue() {
            final ArrayList<ArrayList<Integer>> positionByStringIndexValue = new ArrayList<>(patterns.length);
            for (int i = 0; i < patterns.length; i++) {
                positionByStringIndexValue.add(new ArrayList<Integer>());
            }
            return positionByStringIndexValue;
        }

        /**
         * Finds all occurrences of all patterns in {@code text}.
         *
         * @param text the text to scan
         * @return a list with one entry per pattern (same order as the constructor argument);
         *         entry {@code i} holds the 0-based start offsets of pattern {@code i} in
         *         ascending order, and is empty when the pattern does not occur or is empty
         * @throws NullPointerException if {@code text} is {@code null}
         */
        public ArrayList<ArrayList<Integer>> searchIn(final String text) {
            final var positionByStringIndexValue = initializePositionByStringIndexValue();
            final PatternPositionRecorder positionRecorder = new PatternPositionRecorder(positionByStringIndexValue);

            Node state = root;
            for (int i = 0; i < text.length(); i++) {
                final char ch = text.charAt(i);

                // Fall back along suffix links until the character can be consumed or the root is reached
                while (state != root && !state.getChild().containsKey(ch)) {
                    state = state.getSuffixLink();
                }

                final Node next = state.getChild().get(ch);
                if (next != null) {
                    state = next;
                    positionRecorder.recordPatternPositions(state, i);
                }
                // Otherwise we are at the root with no edge for ch: the character is skipped.
            }

            setUpStartPoints(positionByStringIndexValue);
            return positionByStringIndexValue;
        }

        // The recorder stores match END offsets; convert them in place to START offsets.
        private void setUpStartPoints(ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            for (int i = 0; i < patterns.length; i++) {
                final int shift = patterns[i].length() - 1;
                positionByStringIndexValue.get(i).replaceAll(endpoint -> endpoint - shift);
            }
        }
    }

    /** Collects match end offsets per pattern index while the text is being scanned. */
    private static class PatternPositionRecorder {
        private final ArrayList<ArrayList<Integer>> positionByStringIndexValue;

        // Constructor to initialize the recorder with the position list it appends to
        PatternPositionRecorder(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            this.positionByStringIndexValue = positionByStringIndexValue;
        }

        /**
         * Records {@code currentPosition} for every pattern ending at {@code parent} and at every
         * node reachable from it through output links.
         *
         * @param parent The trie node reached after consuming the current character.
         * @param currentPosition The index of the current character in the input text.
         */
        public void recordPatternPositions(final Node parent, final int currentPosition) {
            for (Node node = parent; node != null; node = node.getOutputLink()) {
                for (final int patternInd : node.getPatternIndices()) {
                    positionByStringIndexValue.get(patternInd).add(currentPosition);
                }
            }
        }
    }

    /**
     * Searches {@code text} for all {@code patterns} at once.
     *
     * @param text the text to scan
     * @param patterns the patterns to look for
     * @return a map from each pattern to the 0-based start offsets of its occurrences
     *         (an empty list when the pattern does not occur)
     * @throws NullPointerException if {@code text}, {@code patterns} or any pattern is {@code null}
     */
    public static Map<String, ArrayList<Integer>> search(final String text, final String[] patterns) {
        final var trie = new Trie(patterns);
        final var positionByStringIndexValue = trie.searchIn(text);
        return convert(positionByStringIndexValue, patterns);
    }

    // Re-keys the per-index result lists by pattern string; each list is copied.
    private static Map<String, ArrayList<Integer>> convert(final ArrayList<ArrayList<Integer>> positionByStringIndexValue, final String[] patterns) {
        final Map<String, ArrayList<Integer>> positionByString = new HashMap<>();
        for (int i = 0; i < patterns.length; i++) {
            positionByString.put(patterns[i], new ArrayList<>(positionByStringIndexValue.get(i)));
        }
        return positionByString;
    }
}
|
120
src/test/java/com/thealgorithms/strings/AhoCorasickTest.java
Normal file
120
src/test/java/com/thealgorithms/strings/AhoCorasickTest.java
Normal file
@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Tests For Aho-Corasick String Matching Algorithm
|
||||
*
|
||||
* Author: Prabhat-Kumar-42
|
||||
* GitHub: https://github.com/Prabhat-Kumar-42
|
||||
*/
|
||||
|
||||
package com.thealgorithms.strings;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* This class contains test cases for the Aho-Corasick String Matching Algorithm.
|
||||
* The Aho-Corasick algorithm is used to efficiently find all occurrences of multiple
|
||||
* patterns in a given text.
|
||||
*/
|
||||
class AhoCorasickTest {
|
||||
private String[] patterns; // The array of patterns to search for
|
||||
private String text; // The input text to search within
|
||||
|
||||
/**
|
||||
* This method sets up the test environment before each test case.
|
||||
* It initializes the patterns and text to be used for testing.
|
||||
*/
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
patterns = new String[] {"ACC", "ATC", "CAT", "GCG", "C", "T"};
|
||||
text = "GCATCG";
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for multiple patterns in the input text.
|
||||
* The expected results are defined for each pattern.
|
||||
*/
|
||||
@Test
|
||||
void testSearch() {
|
||||
// Define the expected results for each pattern
|
||||
final var expected = Map.of("ACC", new ArrayList<>(Arrays.asList()), "ATC", new ArrayList<>(Arrays.asList(2)), "CAT", new ArrayList<>(Arrays.asList(1)), "GCG", new ArrayList<>(Arrays.asList()), "C", new ArrayList<>(Arrays.asList(1, 4)), "T", new ArrayList<>(Arrays.asList(3)));
|
||||
assertEquals(expected, AhoCorasick.search(text, patterns));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching with an empty pattern array.
|
||||
* The result should be an empty map.
|
||||
*/
|
||||
@Test
|
||||
void testEmptyPatterns() {
|
||||
// Define an empty pattern array
|
||||
final var emptyPatterns = new String[] {};
|
||||
assertTrue(AhoCorasick.search(text, emptyPatterns).isEmpty());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for patterns that are not present in the input text.
|
||||
* The result should be an empty list for each pattern.
|
||||
*/
|
||||
@Test
|
||||
void testPatternNotFound() {
|
||||
// Define patterns that are not present in the text
|
||||
final var searchPatterns = new String[] {"XYZ", "123"};
|
||||
final var expected = Map.of("XYZ", new ArrayList<Integer>(), "123", new ArrayList<Integer>());
|
||||
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for patterns that start at the beginning of the input text.
|
||||
* The expected position for each pattern is 0.
|
||||
*/
|
||||
@Test
|
||||
void testPatternAtBeginning() {
|
||||
// Define patterns that start at the beginning of the text
|
||||
final var searchPatterns = new String[] {"GC", "GCA", "GCAT"};
|
||||
final var expected = Map.of("GC", new ArrayList<Integer>(Arrays.asList(0)), "GCA", new ArrayList<Integer>(Arrays.asList(0)), "GCAT", new ArrayList<Integer>(Arrays.asList(0)));
|
||||
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for patterns that end at the end of the input text.
|
||||
* The expected positions are 4, 3, and 2 for the patterns.
|
||||
*/
|
||||
@Test
|
||||
void testPatternAtEnd() {
|
||||
// Define patterns that end at the end of the text
|
||||
final var searchPatterns = new String[] {"CG", "TCG", "ATCG"};
|
||||
final var expected = Map.of("CG", new ArrayList<Integer>(Arrays.asList(4)), "TCG", new ArrayList<Integer>(Arrays.asList(3)), "ATCG", new ArrayList<Integer>(Arrays.asList(2)));
|
||||
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for patterns with multiple occurrences in the input text.
|
||||
* The expected sizes are 1 and 1, and the expected positions are 2 and 3
|
||||
* for the patterns "AT" and "T" respectively.
|
||||
*/
|
||||
@Test
|
||||
void testMultipleOccurrencesOfPattern() {
|
||||
// Define patterns with multiple occurrences in the text
|
||||
final var searchPatterns = new String[] {"AT", "T"};
|
||||
final var expected = Map.of("AT", new ArrayList<Integer>(Arrays.asList(2)), "T", new ArrayList<Integer>(Arrays.asList(3)));
|
||||
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test searching for patterns in a case-insensitive manner.
|
||||
* The search should consider patterns regardless of their case.
|
||||
*/
|
||||
@Test
|
||||
void testCaseInsensitiveSearch() {
|
||||
// Define patterns with different cases
|
||||
final var searchPatterns = new String[] {"gca", "aTc", "C"};
|
||||
final var expected = Map.of("gca", new ArrayList<Integer>(), "aTc", new ArrayList<Integer>(), "C", new ArrayList<Integer>(Arrays.asList(1, 4)));
|
||||
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user