Add AhoCorasick (#4465)

* Added code to find Articulation Points and Bridges

* tried to solve clang-format test

* removed new line at EOF to get lint to pass

* feature: Added Ahocorasick Algorithm

* fixed lint using clang-format

* removed datastructures/graphs/ArticulationPointsAndBridge.java from this branch

* removed main, since test-file is added. Also modified and renamed few functions.

* Added test-file for AhoCorasick Algorithm

* Modified some comments in test-file

* Modified some comments in AhoCorasick.java

* lint fix

* added few more test cases

* Modified some comments

* Change all class fields to private, added initializeSuffixLinksForChildNodesOfTheRoot() method, hashmap string search position (also has previous index based search), removed java.util.*

* Added Missing Test-Cases and more

* minor text changes

* added direct test check i.e. defining a variable expected and just checking if res and expected are equal.

* Created New Class Trie, merged 'buildTrie and buildSuffixAndOutputLinks' with 'Trie constructor'. Merged setUpStartPoints with searchIn. Now AhoCorasick contains -> inner class: Trie, Node. Methods: search and convert. Trie has -> Methods : constructor and searchIn

* Updated TestFile according to the updated AhoCorasick Class. Added Few more test cases

* updated - broken down constructor to relevant parts, made string final, made res local to searchIn(), doxygen-like style

* lint fix clang

* Updated Tests Files

* Added final field to Node class setters and Trie Constructor arguments, removed getTrieRoot() and some unnecessary comments, renamed [old -> new]: res -> positionByStringIndexValue, removed if condition from setupStartPoints()

* updated test file

* lint fix clang

* minor change - 'removed a comment'

* added final fields to some arguments, class and variables, added a method initializePositionByStringIndexValue()

* updated to remove * inclusion and added the required modules only

* Implemented a new class PatternPositionRecorder to wrap up the position recording in searchIn()

* Added final fields to PatternPositionRecorder Class

* style: mark default constructor of `AhoCorasick` as `private`

* style: remove redundant `public`

---------

Co-authored-by: Piotr Idzik <65706193+vil02@users.noreply.github.com>
This commit is contained in:
Prabhat-Kumar 2023-10-08 18:46:06 +05:30 committed by GitHub
parent 06aa834fa6
commit 8200a791a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 369 additions and 0 deletions

View File

@ -0,0 +1,249 @@
/*
* Aho-Corasick String Matching Algorithm Implementation
*
* This code implements the Aho-Corasick algorithm, which is used for efficient
* string matching in a given text. It can find multiple patterns simultaneously
* and records their positions in the text.
*
* Author: Prabhat-Kumar-42
* GitHub: https://github.com/Prabhat-Kumar-42
*/
package com.thealgorithms.strings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
/**
 * Aho-Corasick multi-pattern string matching.
 *
 * <p>A trie is built from the patterns and augmented with suffix links (where to
 * continue after a mismatch) and output links (shorter patterns that end at the
 * same text position). The text is then scanned once and the start index of
 * every occurrence of every pattern is reported.
 *
 * <p>Matching is case-sensitive and works on {@code char} values as returned by
 * {@link String#charAt(int)}. Empty patterns are never reported as matches.
 */
public final class AhoCorasick {
    private AhoCorasick() {
    }

    /** A single state of the pattern trie. */
    private static final class Node {
        // Outgoing transitions, keyed by the character that labels the edge
        private final HashMap<Character, Node> child = new HashMap<>();
        // State to continue from when no transition exists for the next character
        private Node suffixLink;
        // Nearest state on the suffix-link chain that ends a pattern, or null if there is none
        private Node outputLink;
        // Index of the pattern that ends at this state, or -1 if no pattern ends here
        private int patternInd = -1;

        HashMap<Character, Node> getChild() {
            return child;
        }

        Node getSuffixLink() {
            return suffixLink;
        }

        void setSuffixLink(final Node suffixLink) {
            this.suffixLink = suffixLink;
        }

        Node getOutputLink() {
            return outputLink;
        }

        void setOutputLink(final Node outputLink) {
            this.outputLink = outputLink;
        }

        int getPatternInd() {
            return patternInd;
        }

        void setPatternInd(final int patternInd) {
            this.patternInd = patternInd;
        }
    }

    /**
     * Aho-Corasick automaton built for a fixed set of patterns.
     */
    public static class Trie {
        private final Node root = new Node(); // Root node of the trie
        private final String[] patterns; // Private copy of the patterns the trie is built from
        // canonicalIndex[i] is the index of the first pattern equal to patterns[i];
        // only that index is stored in the trie, duplicates share its results.
        private final int[] canonicalIndex;

        /**
         * Builds the automaton for the given patterns.
         *
         * @param patterns the patterns to search for; the array is copied, so later
         *                 changes made by the caller do not affect this trie
         * @throws NullPointerException if {@code patterns} or one of its elements is {@code null}
         */
        public Trie(final String[] patterns) {
            this.patterns = patterns.clone();
            this.canonicalIndex = new int[this.patterns.length];
            buildTrie();
            buildSuffixAndOutputLinks();
        }

        // Inserts every non-empty pattern into the trie and marks the node where it ends
        private void buildTrie() {
            for (int i = 0; i < patterns.length; i++) {
                canonicalIndex[i] = i;
                final String pattern = patterns[i];
                if (pattern.isEmpty()) {
                    // An empty pattern would mark the root as a match state and produce
                    // meaningless positions, so it is left out of the trie.
                    continue;
                }
                Node curr = root;
                for (int j = 0; j < pattern.length(); j++) {
                    curr = curr.getChild().computeIfAbsent(pattern.charAt(j), key -> new Node());
                }
                if (curr.getPatternInd() >= 0) {
                    // Same pattern seen before: remember where its results will be recorded
                    canonicalIndex[i] = curr.getPatternInd();
                } else {
                    curr.setPatternInd(i);
                }
            }
        }

        // Computes suffix and output links with a breadth-first traversal of the trie
        private void buildSuffixAndOutputLinks() {
            root.setSuffixLink(root);
            final Queue<Node> queue = new LinkedList<>();
            // Every child of the root falls back to the root
            for (final Node rootChild : root.getChild().values()) {
                rootChild.setSuffixLink(root);
                queue.add(rootChild);
            }
            while (!queue.isEmpty()) {
                final Node current = queue.poll();
                for (final Map.Entry<Character, Node> edge : current.getChild().entrySet()) {
                    final Character label = edge.getKey();
                    final Node childNode = edge.getValue();
                    // Walk the parent's suffix chain until a state with a transition on label is found
                    Node fallback = current.getSuffixLink();
                    while (fallback != root && !fallback.getChild().containsKey(label)) {
                        fallback = fallback.getSuffixLink();
                    }
                    final Node target = fallback.getChild().get(label);
                    childNode.setSuffixLink(target != null ? target : root);
                    queue.add(childNode);
                }
                // The suffix target is shallower than current, so BFS has already set its output link
                final Node suffix = current.getSuffixLink();
                current.setOutputLink(suffix.getPatternInd() >= 0 ? suffix : suffix.getOutputLink());
            }
        }

        // Creates one empty position list per pattern
        private ArrayList<ArrayList<Integer>> initializePositionByStringIndexValue() {
            final ArrayList<ArrayList<Integer>> positionByStringIndexValue = new ArrayList<>(patterns.length);
            for (int i = 0; i < patterns.length; i++) {
                positionByStringIndexValue.add(new ArrayList<Integer>());
            }
            return positionByStringIndexValue;
        }

        /**
         * Finds all occurrences of the patterns in the given text.
         *
         * @param text the text to scan
         * @return a list with one entry per pattern, in the order the patterns were passed
         *         to the constructor; entry {@code i} holds the start indices in {@code text}
         *         of the occurrences of pattern {@code i} in ascending order (empty if the
         *         pattern does not occur or is the empty string)
         * @throws NullPointerException if {@code text} is {@code null}
         */
        public ArrayList<ArrayList<Integer>> searchIn(final String text) {
            final ArrayList<ArrayList<Integer>> positionByStringIndexValue = initializePositionByStringIndexValue();
            final PatternPositionRecorder positionRecorder = new PatternPositionRecorder(positionByStringIndexValue);
            Node state = root;
            for (int i = 0; i < text.length(); i++) {
                final char ch = text.charAt(i);
                Node next = state.getChild().get(ch);
                // On a mismatch, fall back along suffix links until a transition exists or the root is reached
                while (next == null && state != root) {
                    state = state.getSuffixLink();
                    next = state.getChild().get(ch);
                }
                if (next != null) {
                    state = next;
                    positionRecorder.recordPatternPositions(state, i);
                }
            }
            setUpStartPoints(positionByStringIndexValue);
            copyPositionsToDuplicates(positionByStringIndexValue);
            return positionByStringIndexValue;
        }

        // The recorder stores the index of the last character of each match;
        // this converts those end points to start points.
        private void setUpStartPoints(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            for (int i = 0; i < patterns.length; i++) {
                final int shift = patterns[i].length() - 1;
                positionByStringIndexValue.get(i).replaceAll(endpoint -> endpoint - shift);
            }
        }

        // Gives every repeated pattern the positions recorded for its first occurrence in the pattern array
        private void copyPositionsToDuplicates(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            for (int i = 0; i < patterns.length; i++) {
                if (canonicalIndex[i] != i) {
                    positionByStringIndexValue.get(i).addAll(positionByStringIndexValue.get(canonicalIndex[i]));
                }
            }
        }
    }

    /** Collects the text positions at which patterns end. */
    private static final class PatternPositionRecorder {
        private final ArrayList<ArrayList<Integer>> positionByStringIndexValue;

        // Constructor to initialize the recorder with the position list
        PatternPositionRecorder(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            this.positionByStringIndexValue = positionByStringIndexValue;
        }

        /**
         * Records the current position for the pattern ending at the given node (if any)
         * and for every pattern reachable from it through output links.
         *
         * @param parent The trie node reached after consuming the current character.
         * @param currentPosition The index of the current character in the input text.
         */
        void recordPatternPositions(final Node parent, final int currentPosition) {
            if (parent.getPatternInd() > -1) {
                positionByStringIndexValue.get(parent.getPatternInd()).add(currentPosition);
            }
            // Every node on the output-link chain ends a pattern that is a suffix of the current match
            for (Node link = parent.getOutputLink(); link != null; link = link.getOutputLink()) {
                positionByStringIndexValue.get(link.getPatternInd()).add(currentPosition);
            }
        }
    }

    /**
     * Searches the text for all given patterns.
     *
     * @param text the text to scan
     * @param patterns the patterns to search for
     * @return a map from each pattern to the ascending list of start indices of its
     *         occurrences in {@code text}; patterns that do not occur, and the empty
     *         pattern, map to an empty list
     * @throws NullPointerException if {@code text}, {@code patterns} or one of the patterns is {@code null}
     */
    public static Map<String, ArrayList<Integer>> search(final String text, final String[] patterns) {
        final var trie = new Trie(patterns);
        final var positionByStringIndexValue = trie.searchIn(text);
        return convert(positionByStringIndexValue, patterns);
    }

    // Converts the index-based result into a map keyed by the pattern itself
    private static Map<String, ArrayList<Integer>> convert(final ArrayList<ArrayList<Integer>> positionByStringIndexValue, final String[] patterns) {
        final Map<String, ArrayList<Integer>> positionByString = new HashMap<>();
        for (int i = 0; i < patterns.length; i++) {
            positionByString.put(patterns[i], new ArrayList<>(positionByStringIndexValue.get(i)));
        }
        return positionByString;
    }
}

View File

@ -0,0 +1,120 @@
/*
* Tests For Aho-Corasick String Matching Algorithm
*
* Author: Prabhat-Kumar-42
* GitHub: https://github.com/Prabhat-Kumar-42
*/
package com.thealgorithms.strings;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
/**
* This class contains test cases for the Aho-Corasick String Matching Algorithm.
* The Aho-Corasick algorithm is used to efficiently find all occurrences of multiple
* patterns in a given text.
*/
class AhoCorasickTest {
    private String[] patterns; // The array of patterns to search for
    private String text; // The input text to search within

    /**
     * Initializes the default patterns and text before each test case.
     */
    @BeforeEach
    void setUp() {
        patterns = new String[] {"ACC", "ATC", "CAT", "GCG", "C", "T"};
        text = "GCATCG";
    }

    /**
     * Builds the expected list of start positions for one pattern.
     *
     * @param values the expected start indices; none for a pattern that does not occur
     * @return a new list holding the given indices in order
     */
    private static ArrayList<Integer> positions(final Integer... values) {
        return new ArrayList<>(Arrays.asList(values));
    }

    /**
     * Searches the default text for the default patterns and checks the start
     * positions reported for every pattern, including those that do not occur.
     */
    @Test
    void testSearch() {
        final Map<String, ArrayList<Integer>> expected = Map.of("ACC", positions(), "ATC", positions(2), "CAT", positions(1), "GCG", positions(), "C", positions(1, 4), "T", positions(3));
        assertEquals(expected, AhoCorasick.search(text, patterns));
    }

    /**
     * Searching with an empty pattern array must return an empty map.
     */
    @Test
    void testEmptyPatterns() {
        final var emptyPatterns = new String[] {};
        assertTrue(AhoCorasick.search(text, emptyPatterns).isEmpty());
    }

    /**
     * Patterns that do not occur in the text must map to an empty list.
     */
    @Test
    void testPatternNotFound() {
        final var searchPatterns = new String[] {"XYZ", "123"};
        final Map<String, ArrayList<Integer>> expected = Map.of("XYZ", positions(), "123", positions());
        assertEquals(expected, AhoCorasick.search(text, searchPatterns));
    }

    /**
     * Patterns that are prefixes of the text must all be reported at position 0.
     */
    @Test
    void testPatternAtBeginning() {
        final var searchPatterns = new String[] {"GC", "GCA", "GCAT"};
        final Map<String, ArrayList<Integer>> expected = Map.of("GC", positions(0), "GCA", positions(0), "GCAT", positions(0));
        assertEquals(expected, AhoCorasick.search(text, searchPatterns));
    }

    /**
     * Patterns that are suffixes of the text must be reported at their start
     * positions 4, 3 and 2 respectively.
     */
    @Test
    void testPatternAtEnd() {
        final var searchPatterns = new String[] {"CG", "TCG", "ATCG"};
        final Map<String, ArrayList<Integer>> expected = Map.of("CG", positions(4), "TCG", positions(3), "ATCG", positions(2));
        assertEquals(expected, AhoCorasick.search(text, searchPatterns));
    }

    /**
     * Patterns that occur more than once must have every occurrence reported in
     * ascending order: "C" occurs at 1 and 4, "G" at 0 and 5. "AT" and "T" occur
     * once each, at 2 and 3, with "T" being a suffix of "AT".
     */
    @Test
    void testMultipleOccurrencesOfPattern() {
        final var searchPatterns = new String[] {"AT", "T", "C", "G"};
        final Map<String, ArrayList<Integer>> expected = Map.of("AT", positions(2), "T", positions(3), "C", positions(1, 4), "G", positions(0, 5));
        assertEquals(expected, AhoCorasick.search(text, searchPatterns));
    }

    /**
     * Matching is case-sensitive: patterns whose letter case differs from the
     * text ("gca", "aTc") must not be found, while "C" is found at 1 and 4.
     */
    @Test
    void testCaseInsensitiveSearch() {
        final var searchPatterns = new String[] {"gca", "aTc", "C"};
        final Map<String, ArrayList<Integer>> expected = Map.of("gca", positions(), "aTc", positions(), "C", positions(1, 4));
        assertEquals(expected, AhoCorasick.search(text, searchPatterns));
    }
}