JetBrains Academy: Rabin Karp algorithm in Java - Kamil-Jankowski/Learning-JAVA GitHub Wiki

JetBrains Academy: Rabin-Karp algorithm in Java

The number of distinct substrings:

Given a string s. Write a program that counts the number of distinct substrings in s. In this problem for a polynomial hash with constants a = 53 and m = 109 + 9. It is guaranteed that if hash values for two substrings are equal, then the strings are equal as well.

NB: Remember about the empty string.

import java.util.*;

public class Main {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        String text = scanner.nextLine();
        
        int result = RabinKarpClass.countDistinctSubstrings(text);
        System.out.println(result);
    }
}

class RabinKarpClass {
    public static long charToLong(char ch) {
        return (long) (ch - 'A' + 1);            // ch - 65 + 1
    }

    public static int countDistinctSubstrings(String text) {
        int a = 53;
        long m = 1_000_000_000 + 9;
        Set<Long> hashValues = new HashSet<>();
        hashValues.add(-1L);                    // adding emptyString

        for (int i = 0; i < text.length(); i++) {
            String substring = text.substring(i);
            long hash = 0;
            long pow = 1;

            for (int j = 0; j < substring.length(); j++) {
                hash += charToLong(substring.charAt(j)) * pow;
                hash %= m;

                if (j != substring.length() - 1) {
                    pow = pow * a % m;
                }
                hashValues.add(hash);
            }
        }
        return hashValues.size();
    }
}

Repetitive Substring:

Given a string s and an integer k. Write a program that checks whether there is a substring of length k that appears in s at least twice.

Input: the first line contains a string s such that āˆ£sāˆ£ ā‰¤ 6ā‹…106. The second line contains an integer k ā‰¤ 500.

Output: if s contains a substring of length k repeating at least twice, print this substring. Otherwise, output "does not contain".

import java.io.*;
import java.util.*;

class Main {
    public static void main(String[] args) {
        String answer;
        String text;
        int k;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            text = reader.readLine();
            k = Integer.parseInt(reader.readLine());
            
            answer = RabinKarpClass.repetitiveSubstring(text, k);
            System.out.println(answer);
            
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

class RabinKarpClass {
    private static final int A = 53;
    private static final int B = 97;
    private static final long M = 1_000_000_000 + 9;
    
    private static long charToLong(char ch) {
        return (long) (ch - 'A' + 1);    // ch - 65 + 1
    }
    
    public static String repetitiveSubstring(String text, int length) {
        final String defaultAnswer = "does not contain";
        long currSubstrHashA = 0;
        long currSubstrHashB = 0;
        long powA = 1;
        long powB = 1;
        String repetitive = "";
        boolean matchFound = false;
        Map<Long, List<Long>> substrings = new HashMap<>();


        // hash the first substring from text of length "length"
        for (int i = 0; i < length; i++) {
            char currentCharacter = text.charAt(text.length() - length + i);

            // first hash
            currSubstrHashA += charToLong(currentCharacter) * powA;
            currSubstrHashA %= M;

            if (i != length - 1) {
                powA = powA * A % M;
            }

            // doubled hash
            currSubstrHashB += charToLong(currentCharacter) * powB;
            currSubstrHashB %= M;

            if (i != length - 1) {
                powB = powB * B % M;
            }
        }
        substrings.put(currSubstrHashA, new ArrayList<>());
        substrings.get(currSubstrHashA).add(currSubstrHashB);

        for (int i = text.length(); i >= length; i--) {
            if (i > length) {
                // using rolling hash, compare and add next hashes to Map if no match
                currSubstrHashA = (currSubstrHashA - charToLong(text.charAt(i - 1)) * powA % M + M) * A % M;
                currSubstrHashA = (currSubstrHashA + charToLong(text.charAt(i - length - 1))) % M;

                currSubstrHashB = (currSubstrHashB - charToLong(text.charAt(i - 1)) * powB % M + M) * B % M;
                currSubstrHashB = (currSubstrHashB + charToLong(text.charAt(i - length - 1))) % M;

                if (substrings.containsKey(currSubstrHashA)) {
                    // when match is found, save substring into variable
                    if (substrings.get(currSubstrHashA).contains(currSubstrHashB)) {
                        matchFound = true;
                        repetitive = text.substring(i - length - 1, i - 1);
                        break;
                    } else {
                        matchFound = false;
                        substrings.get(currSubstrHashA).add(currSubstrHashB);
                    }
                } else {
                    matchFound = false;
                    substrings.put(currSubstrHashA, new ArrayList<>());
                    substrings.get(currSubstrHashA).add(currSubstrHashB);
                }
            }
        }

        return matchFound ? repetitive : defaultAnswer;
    } 
}

A substring of the maximal length:

Given a string s. Write a program that finds a substring of the maximal length that appears in s at least twice.

Solution (memory inefficient):

import java.io.*;

public class Main {
    public static void main(String[] args) {
        String text;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            text = reader.readLine();

            int longestRepeatedSubstring = RabinKarpClass.longestRepeatedSubstring(text);

            System.out.println(longestRepeatedSubstring);
            
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

class RabinKarpClass {
    
    public static int longestRepeatedSubstring(String text) {
        int length = text.length();
        String[] suffixes = new String[length];

        for (int i = 0; i < length; i++) {
            suffixes[i] = text.substring(i);
        }

        MergeSort.sortString(suffixes, 0, length);

        String lrs = "";

        for (int i = 0; i < length - 1; i++) {
            String common = longestCommonPrefix(suffixes[i], suffixes[i + 1]);
            if (common.length() > lrs.length()) {
                lrs = common;
            }
        }
        return lrs.length();
    }

    public static String longestCommonPrefix(String s, String t) {
        int n = Math.min(s.length(), t.length());
        for (int i = 0; i < n; i++) {
            if (s.charAt(i) != t.charAt(i)) {
                return s.substring(0, i);
            }
        }
        return s.substring(0, n);
    }
}

class MergeSort {
    
    public static void sortString(String[] array, int leftIncl, int rightExcl) {
        // the base case: if subarray contains <= 1 items, stop dividing because it's sorted
        if (rightExcl <= leftIncl + 1) {
            return;
        }

        /* divide: calculate the index of the middle element */
        int middle = leftIncl + (rightExcl - leftIncl) / 2;

        sortString(array, leftIncl, middle);  // conquer: sort the left subarray
        sortString(array, middle, rightExcl); // conquer: sort the right subarray

        /* combine: merge both sorted subarrays into sorted one */
        combine(array, leftIncl, middle, rightExcl);
    }

    private static void combine(String[] array, int left, int middle, int right) {
        int i = left;   // index for the left subarray
        int j = middle; // index for the right subarray
        int k = 0;      // index for the temp subarray

        String[] temp = new String[right - left]; // temporary array for merging

        /* get the next lesser element from one of two subarrays
        and then insert it in the array until one of the subarrays is empty */
        while (i < middle && j < right) {
            if (array[i].compareTo(array[j]) > 0) {
                temp[k] = array[j];
                j++;
            } else {
                temp[k] = array[i];
                i++;
            }
            k++;
        }

        /* insert all the remaining elements of the left subarray in the array */
        for (;i < middle; i++, k++) {
            temp[k] = array[i];
        }

        /* insert all the remaining elements of the right subarray in the array */
        for (;j < right; j++, k++) {
            temp[k] = array[j];
        }

        /* effective copying elements from temp to array */
        System.arraycopy(temp, 0, array, left, temp.length);
    }
}

or

Solution (time inefficient):

import java.io.*;
import java.util.*;

public class Main {
    public static void main(String[] args) {
        String text;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            text = reader.readLine();

            int longestRepeatedSubstring = RabinKarpClass.longestRepeatedSubstringHashed(text);

            System.out.println(longestRepeatedSubstring);
            
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

class RabinKarpClass {
    
    private static long charToLong(char ch) {
        return (long)(ch - 'A' + 1);    // ch - 65 + 1
    }
    
    private static long hash(String s) {
        int a = 53;
        long m = 1_000_000_000 + 9;

        long hash = 0;
        long pow = 1;

        for (int i = 0; i < s.length(); i++) {
            hash += charToLong(s.charAt(i)) * pow;
            hash %= m;

            if (i != s.length() - 1) {
                pow = pow * a % m;
            }
        }
        return hash;
    }

    private static long hash2(String s) {
        int a = 97;
        long m = 1_000_000_000 + 9;

        long hash = 0;
        long pow = 1;

        for (int i = 0; i < s.length(); i++) {
            hash += charToLong(s.charAt(i)) * pow;
            hash %= m;

            if (i != s.length() - 1) {
                pow = pow * a % m;
            }
        }
        return hash;
    }
    
    static int longestRepeatedSubstringHashed(String text) {
        int lrsLength = 0;
        boolean matchFound = false;
        // binary search of number from 1 to text.length() which will be the length of the longest substring
        int left = 0;
        int right = text.length() - 1;

        while (left <= right) {
            int mid = left + (right - left) / 2;
            Map<Long, List<Long>> doubleHashed = new HashMap<>();
            // hashing values of substrings of length currently being checked, placing them inside the HashMap,
            for (int i = 0; i <= text.length() - mid; i++) {
                String substring = text.substring(i, i + mid);
                long key = hash(substring);
                long value = hash2(substring);

                if (doubleHashed.containsKey(key)){
                    // if match is found ie. object is already in the Map save it to lrs and its length to lrsLength
                    if (doubleHashed.get(key).contains(value)) {
                        //match found
                        lrsLength = mid;
                        matchFound = true;
                        break;
                    } else {
                        doubleHashed.get(key).add(value);
                        matchFound = false;
                    }
                } else {
                    List<Long> temp = new ArrayList<>();
                    temp.add(value);
                    doubleHashed.put(key, temp);
                    matchFound = false;
                }
            }

            if (matchFound) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }

        return lrsLength;
    }
}

or

TBD

Calculating a substring hash:

The polynomial hash function has the following property: if we know hash values for all prefixes of some string, we can find a hash value for any substring of this string in O(1)O(1). In other words, if we precalculate hashes for prefixes of some string only once, we will get a fast way to compute a hash for arbitrary substrings of this string. Let's now learn the details of how the approach works.

Consider a string s = s0s1...sn-1. A polynomial hash for a substring s[i : j] can be calculated as follows:

hP(s[i : j]) = (si ā‹… a0 + si+1 ā‹… a1 + ... + sjāˆ’1 ā‹… ajāˆ’iāˆ’1) mod m

If we multiply the expression by ai, we will get the following:

hP(s[i : j]) ā‹… ai = (si ā‹… ai + si+1 ā‹… ai+1 + ... + sjāˆ’1 ā‹… ajāˆ’1) mod m = (āˆ‘<k=i, k=j-1> sk ā‹… ak) mod m

The last sum can be rewritten as a difference of two sums, each being a polynomial hash for a prefix of s:

hP(s[i : j]) ā‹… ai = (āˆ‘<k=i, k=j-1> sk ā‹… ak) mod m = ((āˆ‘<k=0, k=j-1> sk ā‹… ak) - (āˆ‘<k=0, k=i-1> sk ā‹… ak)) mod m

Using the slice notation, the expression above can be written as follows:

hP(s[i : j]) ā‹… ai = hP(s[0 : j]) - hP(s[0 : i]) mod m

This formula gives us an explicit way to calculate a hash value for any substring of s in O(1) given that hashes for all prefixes are precalculated.

In this task, you need to apply the described approach to calculate hash values for given substrings of some string. One thing to keep in mind here is that we got hash value multiplied by ai. Thus, to get the hash value itself, we need to perform a modulo division by ai. For the sake of simplicity, we will avoid this step here and ask you to calculate hash values with this additional multiplier.

Input: the first line contains a string s. The second one contains an integer k. Each of the following k lines contains a pair of indexes i and j separated by space, such that 0 ā‰¤ i < j ā‰¤ āˆ£sāˆ£.

Output: the first line should contain hash values for all prefixes of s. The second line should contain k integers, each equal to hP(s[i : j]) ā‹… ai for the given in the input indexes i and j. Note that all hash values have to be non-negative.

In this problem, you are expected to use a polynomial hash with the following parameters: a = 53 and m = 109+9.

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;

public class Main {

    public static void main(String[] args) {
        String text;
        int lines;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            text = reader.readLine();
            lines = Integer.parseInt(reader.readLine());

            List<Long> hashValues = RabinKarpClass.substringHash(text);
            List<Long> substringHashes = new ArrayList<>();

            for (int k = 0; k < lines; k++) {
                int[] line = Arrays.stream(reader.readLine().split("\\s+")).mapToInt(Integer::parseInt).toArray();
                int i = line[0];
                int j = line[1];

                long hash = RabinKarpClass.indexedSubstringHash(hashValues, i, j);
                substringHashes.add(hash);
            }

            String hashes = hashValues.stream().map(String::valueOf).skip(1).collect(Collectors.joining(" "));
            String substrings = substringHashes.stream().map(String::valueOf).collect(Collectors.joining(" "));
            System.out.println(hashes);
            System.out.println(substrings);

        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

class RabinKarpClass {
    // parameters for calculating hash values
    private static int a = 53;
    private static long m = 1_000_000_000 + 9;
    
    private static long charToLong(char ch) {
        return (long) (ch - 'A' + 1);    // ch - 65 + 1
    }
    
    public static List<Long> substringHash(String s) {
        List<Long> substringHashValues = new ArrayList<>();
        substringHashValues.add(0L);    // empty string hash value

        long substringHash = 0;
        long pow = 1;

        // 1. Calculating hash values for all prefixes of String s
        for (int i = 0; i < s.length(); i++) {
            substringHash += charToLong(s.charAt(i)) * pow;
            substringHash %= m;

            if (i != s.length() - 1) {
                pow = pow * a % m;
            }
            substringHashValues.add(substringHash);
        }

        return substringHashValues;
    }

    public static long indexedSubstringHash(List<Long> hashValues, int i, int j) {
        return (hashValues.get(j) - hashValues.get(i) + m) % m;
    }
}

The number of equal substring pairs:

Given a string s and a set of pairs, where each element represents the starting and the ending position of a substring of s. Write a program that counts the number of pairs of equal substrings.

Input: the first line contains a string s. The second line contains an integer t. Each of the following t lines contains 4 integers i, j, k, and n separated by space, such that 0 ā‰¤ i < j ā‰¤ āˆ£sāˆ£, 0 ā‰¤ k < n ā‰¤ āˆ£sāˆ£.

Output: the number of pairs, such that s[i:j] is equal to s[k:n].

In this problem for a polynomial hash with constants a = 53 and m = 109 + 9. It is guaranteed that if hash values for two substrings are equal, then the strings are equal as well.

Solution (memory inefficient):

import java.io.*;
import java.util.*;

public class Main {

    public static void main(String[] args) {
        String text;
        int lines;
        int counter = 0;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            text = reader.readLine();
            lines = Integer.parseInt(reader.readLine());

            for (int i = 0; i < lines; i++) {
                int[] line = Arrays.stream(reader.readLine().split("\\s+")).mapToInt(Integer::parseInt).toArray();

                String substring_ij = text.substring(line[0], line[1]);
                String substring_kn = text.substring(line[2], line[3]);
                long hash_ij = hash(substring_ij);
                long hash_kn = hash(substring_kn);

                if (hash_ij == hash_kn) {
                    counter ++;
                }
            }

            System.out.println(counter);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

class RabinKarpClass {
    public static long charToLong(char ch) {
        return (long) (ch - 'A' + 1);            // ch - 65 + 1
    }

    public static long hash(String s) {
        int a = 53;
        long m = 1_000_000_000 + 9;

        long hash = 0;
        long pow = 1;

        for (int i = 0; i < s.length(); i++) {
            hash += charToLong(s.charAt(i)) * pow;
            hash %= m;

            if (i != s.length() - 1) {
                pow = pow * a % m;
            }
        }
        return hash;
    }
}

or

TBD

āš ļø **GitHub.com Fallback** āš ļø