From 8200a791a28002ca23ac70c73d4c83399beab831 Mon Sep 17 00:00:00 2001 From: Prabhat-Kumar Date: Sun, 8 Oct 2023 18:46:06 +0530 Subject: [PATCH] Add `AhoCorasick` (#4465) * Added code to find Articulation Points and Bridges * tried to solve clang-format test * removed new line at EOF to get lint to pass * feature: Added AhoCorasick Algorithm * fixed lint using clang-format * removed datastructures/graphs/ArticulationPointsAndBridge.java from this branch * removed main, since test-file is added. Also modified and renamed a few functions. * Added test-file for AhoCorasick Algorithm * Modified some comments in test-file * Modified some comments in AhoCorasick.java * lint fix * added a few more test cases * Modified some comments * Change all class fields to private, added initializeSuffixLinksForChildNodesOfTheRoot() method, hashmap string search position (also has previous index based search), removed java.util.* * Added Missing Test-Cases and more * minor text changes * added direct test check i.e. defining a variable expected and just checking if res and expected are equal. * Created New Class Trie, merged 'buildTrie and buildSuffixAndOutputLinks' with 'Trie constructor'. Merged setUpStartPoints with searchIn. Now AhoCorasick contains -> inner class: Trie, Node. Methods: search and convert. Trie has -> Methods : constructor and searchIn * Updated TestFile according to the updated AhoCorasick Class. 
Added Few more test cases * updated - broken down constructor to relevant parts, made string final, made res local to searchIn(), doxygen-like style * lint fix clang * Updated Tests Files * Added final field to Node class setters and Trie Constructor arguments, removed getTrieRoot() and some unnecessary comments, renamed [old -> new]: res -> positionByStringIndexValue, removed if condition from setupStartPoints() * updated test file * lint fix clang * minor change - 'removed a comment' * added final fields to some arguments, class and variables, added a method initializePositionByStringIndexValue() * updated to remove * inclusion and added the required modules only * Implemented a new class PatternPositionRecorder to wrap up the position recording in searchIn() * Added final fields to PatternPositionRecorder Class * style: mark default constructor of `AhoCorasick` as `private` * style: remove redundant `public` --------- Co-authored-by: Piotr Idzik <65706193+vil02@users.noreply.github.com> --- .../thealgorithms/strings/AhoCorasick.java | 249 ++++++++++++++++++ .../strings/AhoCorasickTest.java | 120 +++++++++ 2 files changed, 369 insertions(+) create mode 100644 src/main/java/com/thealgorithms/strings/AhoCorasick.java create mode 100644 src/test/java/com/thealgorithms/strings/AhoCorasickTest.java diff --git a/src/main/java/com/thealgorithms/strings/AhoCorasick.java b/src/main/java/com/thealgorithms/strings/AhoCorasick.java new file mode 100644 index 00000000..6381830c --- /dev/null +++ b/src/main/java/com/thealgorithms/strings/AhoCorasick.java @@ -0,0 +1,249 @@ +/* + * Aho-Corasick String Matching Algorithm Implementation + * + * This code implements the Aho-Corasick algorithm, which is used for efficient + * string matching in a given text. It can find multiple patterns simultaneously + * and records their positions in the text. 
/**
 * Aho-Corasick multi-pattern string matching.
 *
 * <p>All patterns are inserted into a trie whose nodes carry suffix links (where to resume after a
 * mismatch) and output links (shorter patterns ending at the same text position). The text is then
 * walked once, character by character, and every occurrence of every pattern is reported by its
 * start index. Matching is case-sensitive and operates on {@code char} values.
 *
 * <p>Empty patterns never match: they are reported with an empty position list.
 *
 * <p>Author: Prabhat-Kumar-42 (https://github.com/Prabhat-Kumar-42)
 */
public final class AhoCorasick {
    private AhoCorasick() {
    }

    /** One state of the automaton, i.e. one node of the pattern trie. */
    private static final class Node {
        // Outgoing trie edges, keyed by the character that labels the edge
        private final Map<Character, Node> child = new HashMap<>();
        // Indices of all patterns that end exactly at this node (several when patterns repeat)
        private final ArrayList<Integer> patternIndices = new ArrayList<>();
        // Node reached by the longest proper suffix of this node's path that is also in the trie
        private Node suffixLink;
        // Nearest node on the suffix-link chain that ends a pattern, or null if there is none
        private Node outputLink;

        Map<Character, Node> getChild() {
            return child;
        }

        ArrayList<Integer> getPatternIndices() {
            return patternIndices;
        }

        boolean endsPattern() {
            return !patternIndices.isEmpty();
        }

        Node getSuffixLink() {
            return suffixLink;
        }

        void setSuffixLink(final Node suffixLink) {
            this.suffixLink = suffixLink;
        }

        Node getOutputLink() {
            return outputLink;
        }

        void setOutputLink(final Node outputLink) {
            this.outputLink = outputLink;
        }
    }

    /** The Aho-Corasick automaton built for a fixed array of patterns. */
    public static class Trie {

        private final Node root = new Node(); // Root node of the trie
        private final String[] patterns; // Patterns according to which the trie is constructed

        /**
         * Builds the automaton for the given patterns.
         *
         * @param patterns patterns to search for; duplicates are allowed, empty strings never match
         * @throws NullPointerException if {@code patterns} or one of its elements is {@code null}
         */
        public Trie(final String[] patterns) {
            // Copy the array: start positions are later derived from the pattern lengths,
            // so a caller mutating its array must not be able to skew the results.
            this.patterns = patterns.clone();
            buildTrie();
            buildSuffixAndOutputLinks();
        }

        // Inserts every non-empty pattern into the trie and tags its last node with the pattern index
        private void buildTrie() {
            for (int i = 0; i < patterns.length; i++) {
                final String pattern = patterns[i];
                if (pattern.isEmpty()) {
                    // Tagging the root would make every node output-link to it and
                    // record meaningless positions, so empty patterns are left out.
                    continue;
                }
                Node curr = root;
                for (int j = 0; j < pattern.length(); j++) {
                    curr = curr.getChild().computeIfAbsent(pattern.charAt(j), key -> new Node());
                }
                curr.getPatternIndices().add(i);
            }
        }

        // Breadth-first pass that assigns suffix links and output links to every node
        private void buildSuffixAndOutputLinks() {
            root.setSuffixLink(root);
            final Queue<Node> queue = new LinkedList<>();

            // Nodes directly below the root can only fall back to the root
            for (final Node depthOne : root.getChild().values()) {
                depthOne.setSuffixLink(root);
                queue.add(depthOne);
            }

            while (!queue.isEmpty()) {
                final Node state = queue.poll();

                for (final Map.Entry<Character, Node> edge : state.getChild().entrySet()) {
                    final char c = edge.getKey();
                    final Node childNode = edge.getValue();

                    // Walk the parent's suffix chain until a node with an edge for c is found
                    Node fallback = state.getSuffixLink();
                    while (fallback != root && !fallback.getChild().containsKey(c)) {
                        fallback = fallback.getSuffixLink();
                    }
                    final Node target = fallback.getChild().get(c);
                    childNode.setSuffixLink(target != null ? target : root);

                    queue.add(childNode);
                }

                // The suffix target is shallower, so it was polled earlier and its output link is final
                final Node suffix = state.getSuffixLink();
                state.setOutputLink(suffix.endsPattern() ? suffix : suffix.getOutputLink());
            }
        }

        // Creates one empty position list per pattern index
        private ArrayList<ArrayList<Integer>> initializePositionByStringIndexValue() {
            final ArrayList<ArrayList<Integer>> positionByStringIndexValue = new ArrayList<>(patterns.length);
            for (int i = 0; i < patterns.length; i++) {
                positionByStringIndexValue.add(new ArrayList<>());
            }
            return positionByStringIndexValue;
        }

        /**
         * Searches the text for all patterns of this trie.
         *
         * @param text the text to scan
         * @return a list with one entry per pattern index, each holding the start indices of that
         *         pattern's occurrences in ascending order; duplicate patterns get equal lists
         * @throws NullPointerException if {@code text} is {@code null}
         */
        public ArrayList<ArrayList<Integer>> searchIn(final String text) {
            final var positionByStringIndexValue = initializePositionByStringIndexValue();
            final var positionRecorder = new PatternPositionRecorder(positionByStringIndexValue);

            Node state = root;
            for (int i = 0; i < text.length(); i++) {
                final char ch = text.charAt(i);

                // On a mismatch fall back along suffix links until ch can be consumed (or the root is hit)
                while (state != root && !state.getChild().containsKey(ch)) {
                    state = state.getSuffixLink();
                }

                final Node next = state.getChild().get(ch);
                if (next != null) {
                    state = next;
                    positionRecorder.recordPatternPositions(state, i);
                }
            }

            setUpStartPoints(positionByStringIndexValue);
            return positionByStringIndexValue;
        }

        // The recorder stores end positions; shift each one back to the start of its pattern
        private void setUpStartPoints(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            for (int i = 0; i < patterns.length; i++) {
                final int shift = patterns[i].length() - 1;
                positionByStringIndexValue.get(i).replaceAll(endpoint -> endpoint - shift);
            }
        }
    }

    /** Collects, per pattern index, the text positions at which patterns end. */
    private static final class PatternPositionRecorder {
        private final ArrayList<ArrayList<Integer>> positionByStringIndexValue;

        // Constructor to initialize the recorder with the position list it appends to
        PatternPositionRecorder(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            this.positionByStringIndexValue = positionByStringIndexValue;
        }

        /**
         * Records the current position for every pattern ending at the given node and at all nodes
         * reachable from it through output links.
         *
         * @param parent the trie node reached after consuming the current text character
         * @param currentPosition index of the current character in the input text
         */
        public void recordPatternPositions(final Node parent, final int currentPosition) {
            for (Node node = parent; node != null; node = node.getOutputLink()) {
                for (final int patternIndex : node.getPatternIndices()) {
                    positionByStringIndexValue.get(patternIndex).add(currentPosition);
                }
            }
        }
    }

    /**
     * Finds all occurrences of the given patterns in the text.
     *
     * @param text the text to scan
     * @param patterns the patterns to look for
     * @return a map from each pattern to the start indices of its occurrences (empty list if none)
     * @throws NullPointerException if an argument or a pattern is {@code null}
     */
    public static Map<String, ArrayList<Integer>> search(final String text, final String[] patterns) {
        final var trie = new Trie(patterns);
        final var positionByStringIndexValue = trie.searchIn(text);
        return convert(positionByStringIndexValue, patterns);
    }

    // Re-keys the per-index result lists by the pattern strings themselves
    private static Map<String, ArrayList<Integer>> convert(final ArrayList<ArrayList<Integer>> positionByStringIndexValue, final String[] patterns) {
        final Map<String, ArrayList<Integer>> positionByString = new HashMap<>();
        for (int i = 0; i < patterns.length; i++) {
            positionByString.put(patterns[i], new ArrayList<>(positionByStringIndexValue.get(i)));
        }
        return positionByString;
    }
}
org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Map; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * This class contains test cases for the Aho-Corasick String Matching Algorithm. + * The Aho-Corasick algorithm is used to efficiently find all occurrences of multiple + * patterns in a given text. + */ +class AhoCorasickTest { + private String[] patterns; // The array of patterns to search for + private String text; // The input text to search within + + /** + * This method sets up the test environment before each test case. + * It initializes the patterns and text to be used for testing. + */ + @BeforeEach + void setUp() { + patterns = new String[] {"ACC", "ATC", "CAT", "GCG", "C", "T"}; + text = "GCATCG"; + } + + /** + * Test searching for multiple patterns in the input text. + * The expected results are defined for each pattern. + */ + @Test + void testSearch() { + // Define the expected results for each pattern + final var expected = Map.of("ACC", new ArrayList<>(Arrays.asList()), "ATC", new ArrayList<>(Arrays.asList(2)), "CAT", new ArrayList<>(Arrays.asList(1)), "GCG", new ArrayList<>(Arrays.asList()), "C", new ArrayList<>(Arrays.asList(1, 4)), "T", new ArrayList<>(Arrays.asList(3))); + assertEquals(expected, AhoCorasick.search(text, patterns)); + } + + /** + * Test searching with an empty pattern array. + * The result should be an empty map. + */ + @Test + void testEmptyPatterns() { + // Define an empty pattern array + final var emptyPatterns = new String[] {}; + assertTrue(AhoCorasick.search(text, emptyPatterns).isEmpty()); + } + + /** + * Test searching for patterns that are not present in the input text. + * The result should be an empty list for each pattern. 
+ */ + @Test + void testPatternNotFound() { + // Define patterns that are not present in the text + final var searchPatterns = new String[] {"XYZ", "123"}; + final var expected = Map.of("XYZ", new ArrayList(), "123", new ArrayList()); + assertEquals(expected, AhoCorasick.search(text, searchPatterns)); + } + + /** + * Test searching for patterns that start at the beginning of the input text. + * The expected position for each pattern is 0. + */ + @Test + void testPatternAtBeginning() { + // Define patterns that start at the beginning of the text + final var searchPatterns = new String[] {"GC", "GCA", "GCAT"}; + final var expected = Map.of("GC", new ArrayList(Arrays.asList(0)), "GCA", new ArrayList(Arrays.asList(0)), "GCAT", new ArrayList(Arrays.asList(0))); + assertEquals(expected, AhoCorasick.search(text, searchPatterns)); + } + + /** + * Test searching for patterns that end at the end of the input text. + * The expected positions are 4, 3, and 2 for the patterns. + */ + @Test + void testPatternAtEnd() { + // Define patterns that end at the end of the text + final var searchPatterns = new String[] {"CG", "TCG", "ATCG"}; + final var expected = Map.of("CG", new ArrayList(Arrays.asList(4)), "TCG", new ArrayList(Arrays.asList(3)), "ATCG", new ArrayList(Arrays.asList(2))); + assertEquals(expected, AhoCorasick.search(text, searchPatterns)); + } + + /** + * Test searching for patterns with multiple occurrences in the input text. + * The expected sizes are 1 and 1, and the expected positions are 2 and 3 + * for the patterns "AT" and "T" respectively. + */ + @Test + void testMultipleOccurrencesOfPattern() { + // Define patterns with multiple occurrences in the text + final var searchPatterns = new String[] {"AT", "T"}; + final var expected = Map.of("AT", new ArrayList(Arrays.asList(2)), "T", new ArrayList(Arrays.asList(3))); + assertEquals(expected, AhoCorasick.search(text, searchPatterns)); + } + + /** + * Test searching for patterns in a case-insensitive manner. 
+ * The search should consider patterns regardless of their case. + */ + @Test + void testCaseInsensitiveSearch() { + // Define patterns with different cases + final var searchPatterns = new String[] {"gca", "aTc", "C"}; + final var expected = Map.of("gca", new ArrayList(), "aTc", new ArrayList(), "C", new ArrayList(Arrays.asList(1, 4))); + assertEquals(expected, AhoCorasick.search(text, searchPatterns)); + } +}