Given a string of length n of lowercase alphabet characters, we need to count total number of distinct substrings of this string.
Examples:
Input : str = "ababa" Output : 10 The total number of distinct substrings is 10 (counting the empty string): "", "a", "b", "ab", "ba", "aba", "bab", "abab", "baba" and "ababa"
We have discussed a Suffix Trie based solution in below post :
Count of distinct substrings of a string using Suffix Trie
We can solve this problem using suffix array and longest common prefix concept. A suffix array is a sorted array of all suffixes of a given string.
For string "ababa" suffixes are : "ababa", "baba", "aba", "ba", "a". After taking these suffixes in sorted form we get our suffix array as [4, 2, 0, 3, 1]
Then we calculate the lcp array using Kasai's algorithm. For string "ababa", the lcp array is [1, 3, 0, 2, 0]
After constructing both arrays, we calculate total number of distinct substring by keeping this fact in mind : If we look through the prefixes of each suffix of a string, we cover all substrings of that string.
We will explain the procedure for above example,
String = "ababa" Suffixes in sorted order : "a", "aba", "ababa", "ba", "baba" Initializing distinct substring count by length of first suffix, Count = length("a") = 1 Substrings taken into consideration : "a" Now we consider each consecutive pair of suffixes, lcp("a", "aba") = "a". All characters that are not part of the longest common prefix contribute to a distinct substring. In the above case, they are 'b' and 'a'. So they should be added to Count. Count += length("aba") - lcp("a", "aba") Count = 3 Substrings taken into consideration : "aba", "ab" Similarly for the next pair, Count += length("ababa") - lcp("aba", "ababa") Count = 5 Substrings taken into consideration : "ababa", "abab" Count += length("ba") - lcp("ababa", "ba") Count = 7 Substrings taken into consideration : "ba", "b" Count += length("baba") - lcp("ba", "baba") Count = 9 Substrings taken into consideration : "baba", "bab" We finally add 1 for the empty string. Count = 10
Implementation:
// C++ code to count total distinct substrings
// of a string
#include<bits/stdc++.h>
usingnamespacestd;
// Structure to store information of a suffix
struct suffix
{
    int index;   // Position in the original text where this suffix starts
    int rank[2]; // Sort key: rank[0] is the rank of the suffix's leading
                 // part, rank[1] the rank of the part that follows it
                 // (-1 when that part lies past the end of the text)
};
// A comparison function used by sort() to compare
// two suffixes. Compares two pairs, returns 1 if
// first pair is smaller
intcmp(structsuffixa,structsuffixb)
{
return(a.rank[0]==b.rank[0])?
(a.rank[1]<b.rank[1]?1:0):
(a.rank[0]<b.rank[0]?1:0);
}
// This is the main function that takes a string
// 'txt' of size n as an argument, builds and return
// the suffix array for the given string
vector<int>buildSuffixArray(stringtxt,intn)
{
// A structure to store suffixes and their indexes
structsuffixsuffixes[n];
// Store suffixes and their indexes in an array
// of structures. The structure is needed to sort
// the suffixes alphabetically and maintain their
// old indexes while sorting
for(inti=0;i<n;i++)
{
suffixes[i].index=i;
suffixes[i].rank[0]=txt[i]-'a';
suffixes[i].rank[1]=((i+1)<n)?
(txt[i+1]-'a'):-1;
}
// Sort the suffixes using the comparison function
// defined above.
sort(suffixes,suffixes+n,cmp);
// At his point, all suffixes are sorted according
// to first 2 characters. Let us sort suffixes
// according to first 4 characters, then first
// 8 and so on
intind[n];// This array is needed to get the
// index in suffixes[] from original
// index. This mapping is needed to get
// next suffix.
for(intk=4;k<2*n;k=k*2)
{
// Assigning rank and index values to first suffix
intrank=0;
intprev_rank=suffixes[0].rank[0];
suffixes[0].rank[0]=rank;
ind[suffixes[0].index]=0;
// Assigning rank to suffixes
for(inti=1;i<n;i++)
{
// If first rank and next ranks are same as
// that of previous suffix in array, assign
// the same new rank to this suffix
if(suffixes[i].rank[0]==prev_rank&&
suffixes[i].rank[1]==suffixes[i-1].rank[1])
{
prev_rank=suffixes[i].rank[0];
suffixes[i].rank[0]=rank;
}
else// Otherwise increment rank and assign
{
prev_rank=suffixes[i].rank[0];
suffixes[i].rank[0]=++rank;
}
ind[suffixes[i].index]=i;
}
// Assign next rank to every suffix
for(inti=0;i<n;i++)
{
intnextindex=suffixes[i].index+k/2;
suffixes[i].rank[1]=(nextindex<n)?
suffixes[ind[nextindex]].rank[0]:-1;
}
// Sort the suffixes according to first k characters
sort(suffixes,suffixes+n,cmp);
}
// Store indexes of all sorted suffixes in the suffix
// array
vector<int>suffixArr;
for(inti=0;i<n;i++)
suffixArr.push_back(suffixes[i].index);
// Return the suffix array
returnsuffixArr;
}
/* To construct and return LCP */
// Builds the LCP array for `txt` from its suffix array using Kasai's
// algorithm.
//
// txt       : the text the suffix array was built from
// suffixArr : suffix array of `txt` (a permutation of 0..n-1)
//
// Returns a vector lcp of the same size as suffixArr, where lcp[i] is
// the length of the longest common prefix of the suffixes at sorted
// positions i and i+1. The last entry has no successor and is 0.
// Both arguments are taken by const reference to avoid copying them.
std::vector<int> kasai(const std::string& txt,
                       const std::vector<int>& suffixArr)
{
    const int n = static_cast<int>(suffixArr.size());

    std::vector<int> lcp(n, 0);

    // invSuff[p] = sorted position of the suffix starting at text
    // position p (inverse permutation of suffixArr).
    std::vector<int> invSuff(n, 0);
    for (int i = 0; i < n; i++)
        invSuff[suffixArr[i]] = i;

    // Number of characters already known to match, carried over from
    // the previous iteration.
    int k = 0;

    // Visit suffixes in text order (not sorted order).
    for (int i = 0; i < n; i++)
    {
        // The last suffix in sorted order has no next suffix to
        // compare with; its lcp stays 0 and the carry is reset.
        if (invSuff[i] == n - 1)
        {
            k = 0;
            continue;
        }

        // Start position of the suffix that follows suffix i in
        // sorted order.
        const int j = suffixArr[invSuff[i] + 1];

        // Resume matching at offset k instead of 0.
        while (i + k < n && j + k < n && txt[i + k] == txt[j + k])
            k++;

        lcp[invSuff[i]] = k;

        // The next suffix in text order drops the first character, so
        // one fewer character is carried over.
        if (k > 0)
            k--;
    }

    return lcp;
}
// method to return count of total distinct substring
intcountDistinctSubstring(stringtxt)
{
intn=txt.length();
// calculating suffix array and lcp array
vector<int>suffixArr=buildSuffixArray(txt,n);
vector<int>lcp=kasai(txt,suffixArr);
// n - suffixArr[i] will be the length of suffix
// at ith position in suffix array initializing
// count with length of first suffix of sorted
// suffixes
intresult=n-suffixArr[0];
for(inti=1;i<lcp.size();i++)
// subtract lcp from the length of suffix
result+=(n-suffixArr[i])-lcp[i-1];
result++;// For empty string
returnresult;
}
// Driver code to test above methods
// Prints the distinct-substring count of the sample string "ababa".
int main()
{
    const std::string txt = "ababa";
    std::cout << countDistinctSubstring(txt);
    return 0;
}
/*package whatever //do not write package name here */
importjava.util.*;
class SuffiximplementsComparable<Suffix>{
intindex;
int[]rank=newint[2];
publicintcompareTo(Suffixs)
{
if(rank[0]==s.rank[0]){
returnInteger.compare(rank[1],s.rank[1]);
}
else{
returnInteger.compare(rank[0],s.rank[0]);
}
}
}
class Main{
staticint[]buildSuffixArray(Stringtxt,intn)
{
Suffix[]suffixes=newSuffix[n];
for(inti=0;i<n;i++){
suffixes[i]=newSuffix();
suffixes[i].index=i;
suffixes[i].rank[0]=txt.charAt(i)-'a';
suffixes[i].rank[1]
=(i+1)<n?txt.charAt(i+1)-'a'
:-1;
}
// Sort the suffixes
Arrays.sort(suffixes);
int[]ind=newint[n];
for(intk=4;k<2*n;k=k*2){
// Assigning rank and index values to first
// suffix
intrank=0;
intprevRank=suffixes[0].rank[0];
suffixes[0].rank[0]=rank;
ind[suffixes[0].index]=0;
for(inti=1;i<n;i++){
// If first rank and next ranks are same as
// that of previous suffix in array, assign
// the same new rank to this suffix
if(suffixes[i].rank[0]==prevRank
&&suffixes[i].rank[1]
==suffixes[i-1].rank[1]){
prevRank=suffixes[i].rank[0];
suffixes[i].rank[0]=rank;
}
else{// Otherwise increment rank and
// assign
prevRank=suffixes[i].rank[0];
suffixes[i].rank[0]=++rank;
}
ind[suffixes[i].index]=i;
}
for(inti=0;i<n;i++){
intnextIndex=suffixes[i].index+k/2;
suffixes[i].rank[1]
=nextIndex<n
?suffixes[ind[nextIndex]].rank[0]
:-1;
}
Arrays.sort(suffixes);
}
// Store indexes of all sorted suffixes in the
// suffix array
int[]suffixArr=newint[n];
for(inti=0;i<n;i++){
suffixArr[i]=suffixes[i].index;
}
returnsuffixArr;
}
staticint[]Const_LCP(Stringtxt,int[]suffixArr)
{
intn=suffixArr.length;
int[]lcp=newint[n];
int[]invSuff=newint[n];
for(inti=0;i<n;i++){
invSuff[suffixArr[i]]=i;
}
intk=0;
for(inti=0;i<n;i++){
if(invSuff[i]==n-1){
k=0;
continue;
}
intj=suffixArr[invSuff[i]+1];
while(i+k<n&&j+k<n
&&txt.charAt(i+k)
==txt.charAt(j+k)){
k++;
}
lcp[invSuff[i]]=k;
if(k>0){
k--;
}
}
returnlcp;
}
staticintcnt_Dist_Substr(Stringtxt)
{
intn=txt.length();
// calculating suffix array and lcp array
int[]suffixArr=buildSuffixArray(txt,n);
int[]lcp=Const_LCP(txt,suffixArr);
// suffixes
intresult=n-suffixArr[0];
for(inti=1;i<lcp.length;i++){
// subtract lcp from the length of suffix
result+=(n-suffixArr[i])-lcp[i-1];
}
result++;// For empty string
returnresult;
}
publicstaticvoidmain(String[]args)
{
Stringtxt="ababa";
System.out.println(cnt_Dist_Substr(txt));
}
}
// This code is contributed by Jay
# Python code to count total distinct substrings
# of a string
# This is the main function that takes a string
# 'txt' of size n as an argument, builds and return
# the suffix array for the given string
def build_suffix_array(txt, n):
    """Build the suffix array of ``txt`` by prefix doubling.

    Suffixes are first sorted by their first 2 characters, then by their
    first 4, 8, ... until the compared prefix covers the whole text.

    Args:
        txt: Text whose suffixes are sorted. Characters are compared by
            code point, so input is not limited to 'a'..'z'.
        n: Number of characters of ``txt`` to use; expected to be
            ``len(txt)``.

    Returns:
        A list ``sa`` of length ``n`` where ``sa[i]`` is the start index
        of the i-th smallest suffix.
    """
    # Structure to store information of a suffix
    class Suffix:
        def __init__(self, index, rank):
            self.index = index  # Start position in the original text
            self.rank = rank    # [rank of leading part, rank of following part]

    # Initial key: the first two characters of each suffix. A code point
    # is never negative, so it cannot collide with the -1 sentinel used
    # for "no second character".
    suffixes = [
        Suffix(i, [ord(txt[i]), ord(txt[i + 1]) if i + 1 < n else -1])
        for i in range(n)
    ]

    # At this point, all suffixes are sorted according to their first
    # 2 characters.
    suffixes.sort(key=lambda s: s.rank)

    # ind[p] = current position in suffixes of the suffix starting at p.
    ind = [0] * n

    k = 4
    while k < 2 * n:
        # Re-number ranks in sorted order; equal key pairs share a rank.
        rank = 0
        # rank[0] is overwritten as we go, so the previous suffix's old
        # value is remembered here.
        prev_rank = suffixes[0].rank[0]
        suffixes[0].rank[0] = rank
        ind[suffixes[0].index] = 0

        for i in range(1, n):
            same_key = (suffixes[i].rank[0] == prev_rank
                        and suffixes[i].rank[1] == suffixes[i - 1].rank[1])
            prev_rank = suffixes[i].rank[0]
            if not same_key:
                rank += 1
            suffixes[i].rank[0] = rank
            ind[suffixes[i].index] = i

        # Second key: rank of the suffix k/2 positions later, or -1 when
        # that position is past the end of the text.
        for i in range(n):
            nextindex = suffixes[i].index + k // 2
            if nextindex < n:
                suffixes[i].rank[1] = suffixes[ind[nextindex]].rank[0]
            else:
                suffixes[i].rank[1] = -1

        # Sort the suffixes according to first k characters
        suffixes.sort(key=lambda s: s.rank)
        k *= 2

    # Extract the start positions in sorted order.
    return [suffix.index for suffix in suffixes]
# To construct and return LCP
def kasai(txt, suffixArr):
    """Build the LCP array for ``txt`` from its suffix array (Kasai's algorithm).

    Args:
        txt: The text the suffix array was built from.
        suffixArr: Suffix array of ``txt``.

    Returns:
        A list ``lcp`` of the same length as ``suffixArr`` where
        ``lcp[i]`` is the length of the longest common prefix of the
        suffixes at sorted positions ``i`` and ``i + 1``. The last entry
        has no successor and is 0.
    """
    n = len(suffixArr)

    # To store LCP array
    lcp = [0] * n

    # invSuff[p] = sorted position of the suffix starting at p
    # (inverse of suffixArr).
    invSuff = [0] * n
    for i in range(n):
        invSuff[suffixArr[i]] = i

    # Matched length carried over from the previous iteration.
    k = 0

    # Visit suffixes in text order (not sorted order).
    for i in range(n):
        # The last suffix in sorted order has no next suffix to compare
        # with; its lcp stays 0 and the carry is reset.
        if invSuff[i] == n - 1:
            k = 0
            continue

        # Start of the suffix that follows suffix i in sorted order.
        j = suffixArr[invSuff[i] + 1]

        # Resume matching at offset k instead of 0.
        while i + k < n and j + k < n and txt[i + k] == txt[j + k]:
            k += 1

        lcp[invSuff[i]] = k

        # The next suffix in text order drops the first character, so
        # one fewer character is carried over.
        if k > 0:
            k -= 1

    return lcp
# method to return count of total distinct substring
def count_distinct_substring(txt):
    """Return the number of distinct substrings of ``txt``, including "".

    Every substring is a prefix of some suffix. Walking the suffixes in
    sorted order, a suffix of length ``length`` contributes ``length``
    prefixes, of which the first ``lcp`` (shared with the previous
    suffix) were already counted.

    Args:
        txt: Input text.

    Returns:
        The distinct substring count; 1 for an empty text.
    """
    n = len(txt)

    # An empty text has only the empty substring; without this guard
    # suffixArr[0] below would raise IndexError.
    if n == 0:
        return 1

    # calculating suffix array and lcp array
    suffixArr = build_suffix_array(txt, n)
    lcp = kasai(txt, suffixArr)

    # n - suffixArr[i] is the length of the suffix at sorted position i.
    # The first suffix shares nothing with a predecessor.
    result = n - suffixArr[0]
    for i in range(1, len(lcp)):
        # subtract lcp from the length of suffix
        result += (n - suffixArr[i]) - lcp[i - 1]

    return result + 1  # +1 for the empty string
# Driver code to test above methods
# Sample run: count the distinct substrings of "ababa" and print the total.
txt = "ababa"
distinct_total = count_distinct_substring(txt)
print(distinct_total)
# This code is contributed by Aman Kumar
// C# code addition
usingSystem;
usingSystem.Linq;
// One suffix of the text together with the two-part key it is sorted by.
// Ordering compares rank[0] first and falls back to rank[1].
// IComparable is fully qualified so the class does not depend on a
// file-level "using System;".
class Suffix : System.IComparable<Suffix>
{
    // Position in the original text where this suffix starts.
    public int index;
    // rank[0]: rank of the leading part, rank[1]: rank of the part after it.
    public int[] rank = new int[2];

    public int CompareTo(Suffix s)
    {
        if (rank[0] == s.rank[0])
        {
            return rank[1].CompareTo(s.rank[1]);
        }
        return rank[0].CompareTo(s.rank[0]);
    }
}
// Counts the distinct substrings of a string using a suffix array and an
// LCP array. System types are fully qualified so the class does not
// depend on file-level using directives.
class Program
{
    // Builds the suffix array of txt by prefix doubling.
    //   txt : text whose suffixes are sorted (characters are compared by
    //         their char value, so input is not limited to 'a'..'z')
    //   n   : number of characters to use; expected to equal txt.Length
    // Returns an array sa where sa[i] is the start index of the i-th
    // smallest suffix.
    static int[] buildSuffixArray(string txt, int n)
    {
        Suffix[] suffixes = new Suffix[n];

        // Initial key: the first two characters of each suffix. A char
        // value is never negative, so it cannot collide with the -1
        // sentinel used for "no second character".
        for (int i = 0; i < n; i++)
        {
            suffixes[i] = new Suffix();
            suffixes[i].index = i;
            suffixes[i].rank[0] = txt[i];
            suffixes[i].rank[1] = (i + 1 < n) ? (int)txt[i + 1] : -1;
        }

        // Sort the suffixes by their first 2 characters.
        System.Array.Sort(suffixes);

        // ind[p] = current position in suffixes[] of the suffix starting at p.
        int[] ind = new int[n];

        for (int k = 4; k < 2 * n; k = k * 2)
        {
            // Re-number ranks in sorted order; equal key pairs share a rank.
            int rank = 0;
            // rank[0] is overwritten as we go, so the previous suffix's
            // old value is remembered here.
            int prevRank = suffixes[0].rank[0];
            suffixes[0].rank[0] = rank;
            ind[suffixes[0].index] = 0;

            for (int i = 1; i < n; i++)
            {
                bool sameKey = suffixes[i].rank[0] == prevRank
                    && suffixes[i].rank[1] == suffixes[i - 1].rank[1];
                prevRank = suffixes[i].rank[0];
                suffixes[i].rank[0] = sameKey ? rank : ++rank;
                ind[suffixes[i].index] = i;
            }

            // Second key: rank of the suffix k/2 positions later, or -1
            // when that position is past the end of the text.
            for (int i = 0; i < n; i++)
            {
                int nextIndex = suffixes[i].index + k / 2;
                suffixes[i].rank[1] = nextIndex < n
                    ? suffixes[ind[nextIndex]].rank[0]
                    : -1;
            }

            // Suffixes are now ordered by their first k characters.
            System.Array.Sort(suffixes);
        }

        // Extract the start positions in sorted order.
        int[] suffixArr = new int[n];
        for (int i = 0; i < n; i++)
        {
            suffixArr[i] = suffixes[i].index;
        }
        return suffixArr;
    }

    // Builds the LCP array from a suffix array using Kasai's algorithm.
    // lcp[i] is the length of the longest common prefix of the suffixes
    // at sorted positions i and i+1; the last entry is 0.
    static int[] Const_LCP(string txt, int[] suffixArr)
    {
        int n = suffixArr.Length;
        int[] lcp = new int[n];

        // invSuff[p] = sorted position of the suffix starting at p.
        int[] invSuff = new int[n];
        for (int i = 0; i < n; i++)
        {
            invSuff[suffixArr[i]] = i;
        }

        // Matched length carried over from the previous iteration.
        int k = 0;
        for (int i = 0; i < n; i++)
        {
            // Last suffix in sorted order: nothing to compare with.
            if (invSuff[i] == n - 1)
            {
                k = 0;
                continue;
            }

            // Start of the suffix that follows suffix i in sorted order.
            int j = suffixArr[invSuff[i] + 1];
            while (i + k < n && j + k < n && txt[i + k] == txt[j + k])
            {
                k++;
            }
            lcp[invSuff[i]] = k;

            // The next suffix in text order drops the first character.
            if (k > 0)
            {
                k--;
            }
        }
        return lcp;
    }

    // Returns the number of distinct substrings of txt, including the
    // empty substring (so the result is 1 for an empty text).
    static int cnt_Dist_Substr(string txt)
    {
        int n = txt.Length;

        // An empty text has only the empty substring; without this guard
        // suffixArr[0] below would be out of range.
        if (n == 0)
        {
            return 1;
        }

        // calculating suffix array and lcp array
        int[] suffixArr = buildSuffixArray(txt, n);
        int[] lcp = Const_LCP(txt, suffixArr);

        // n - suffixArr[i] is the length of the suffix at sorted position i.
        int result = n - suffixArr[0];
        for (int i = 1; i < lcp.Length; i++)
        {
            // Prefixes shared with the previous suffix were already counted.
            result += (n - suffixArr[i]) - lcp[i - 1];
        }
        result++; // For empty string
        return result;
    }

    // Prints the distinct-substring count of the sample string "ababa".
    static void Main()
    {
        string txt = "ababa";
        System.Console.WriteLine(cnt_Dist_Substr(txt));
    }
}
// The code is contributed by Arushi Goel.
// Javascript code to count total distinct substrings
// of a string
// This is the main function that takes a string
// 'txt' of size n as an argument, builds and return
// the suffix array for the given string
/**
 * Builds the suffix array of `txt` by prefix doubling: suffixes are first
 * sorted by their first 2 characters, then by their first 4, 8, ... until
 * the compared prefix covers the whole text.
 *
 * @param {string} txt Text whose suffixes are sorted. Characters are
 *     compared by UTF-16 code unit, so input is not limited to 'a'..'z'.
 * @param {number} n Number of characters to use; expected to be txt.length.
 * @returns {number[]} Array sa where sa[i] is the start index of the i-th
 *     smallest suffix.
 */
function buildSuffixArray(txt, n) {
  // Structure to store information of a suffix
  class Suffix {
    constructor() {
      this.index = 0; // Start position in the original text
      this.rank = [0, 0]; // [rank of leading part, rank of following part]
    }
  }

  // Comparator for sort(): negative when `a` sorts before `b`, comparing
  // rank[0] first and rank[1] on ties.
  function cmp(a, b) {
    return a.rank[0] !== b.rank[0]
      ? a.rank[0] - b.rank[0]
      : a.rank[1] - b.rank[1];
  }

  // Initial key: the first two characters of each suffix. A code unit is
  // never negative, so it cannot collide with the -1 sentinel used for
  // "no second character".
  let suffixes = new Array(n);
  for (let i = 0; i < n; i++) {
    suffixes[i] = new Suffix();
    suffixes[i].index = i;
    suffixes[i].rank[0] = txt.charCodeAt(i);
    suffixes[i].rank[1] = i + 1 < n ? txt.charCodeAt(i + 1) : -1;
  }

  // At this point, all suffixes are sorted according to their first
  // 2 characters.
  suffixes.sort(cmp);

  // ind[p] = current position in suffixes of the suffix starting at p.
  let ind = new Array(n);

  for (let k = 4; k < 2 * n; k *= 2) {
    // Re-number ranks in sorted order; equal key pairs share a rank.
    let rank = 0;
    // rank[0] is overwritten as we go, so the previous suffix's old
    // value is remembered here.
    let prev_rank = suffixes[0].rank[0];
    suffixes[0].rank[0] = rank;
    ind[suffixes[0].index] = 0;

    for (let i = 1; i < n; i++) {
      const sameKey =
        suffixes[i].rank[0] === prev_rank &&
        suffixes[i].rank[1] === suffixes[i - 1].rank[1];
      prev_rank = suffixes[i].rank[0];
      suffixes[i].rank[0] = sameKey ? rank : ++rank;
      ind[suffixes[i].index] = i;
    }

    // Second key: rank of the suffix k/2 positions later, or -1 when
    // that position is past the end of the text. k is a power of two,
    // so k / 2 is an integer.
    for (let i = 0; i < n; i++) {
      let nextindex = suffixes[i].index + k / 2;
      suffixes[i].rank[1] =
        nextindex < n ? suffixes[ind[nextindex]].rank[0] : -1;
    }

    // Sort the suffixes according to first k characters
    suffixes.sort(cmp);
  }

  // Extract the start positions in sorted order.
  let suffixArr = new Array(n);
  for (let i = 0; i < n; i++) suffixArr[i] = suffixes[i].index;
  return suffixArr;
}
/* To construct and return LCP */
/**
 * Builds the LCP array for `txt` from its suffix array using Kasai's
 * algorithm.
 *
 * @param {string} txt The text the suffix array was built from.
 * @param {number[]} suffixArr Suffix array of `txt`.
 * @returns {number[]} Array lcp of the same length as suffixArr, where
 *     lcp[i] is the length of the longest common prefix of the suffixes
 *     at sorted positions i and i+1. The last entry has no successor
 *     and is 0.
 */
function kasai(txt, suffixArr) {
  let n = suffixArr.length;

  // To store LCP array
  let lcp = new Array(n).fill(0);

  // invSuff[p] = sorted position of the suffix starting at p
  // (inverse of suffixArr).
  let invSuff = new Array(n).fill(0);
  for (let i = 0; i < n; i++) invSuff[suffixArr[i]] = i;

  // Matched length carried over from the previous iteration.
  let k = 0;

  // Visit suffixes in text order (not sorted order).
  for (let i = 0; i < n; i++) {
    // The last suffix in sorted order has no next suffix to compare
    // with; its lcp stays 0 and the carry is reset.
    if (invSuff[i] == n - 1) {
      k = 0;
      continue;
    }

    // Start of the suffix that follows suffix i in sorted order.
    let j = suffixArr[invSuff[i] + 1];

    // Resume matching at offset k instead of 0.
    while (i + k < n && j + k < n && txt[i + k] === txt[j + k]) k++;

    lcp[invSuff[i]] = k;

    // The next suffix in text order drops the first character, so one
    // fewer character is carried over.
    if (k > 0) k--;
  }

  return lcp;
}
// method to return count of total distinct substring
/**
 * Returns the number of distinct substrings of `txt`, including the
 * empty substring.
 *
 * Every substring is a prefix of some suffix. Walking the suffixes in
 * sorted order, each suffix contributes as many prefixes as its length,
 * minus those it shares with the previous suffix (the lcp), which were
 * already counted.
 *
 * @param {string} txt Input text.
 * @returns {number} Distinct substring count; 1 for an empty text.
 */
function countDistinctSubstring(txt) {
  let n = txt.length;

  // An empty text has only the empty substring; without this guard
  // suffixArr[0] below is undefined and the result would be NaN.
  if (n === 0) return 1;

  // calculating suffix array and lcp array
  let suffixArr = buildSuffixArray(txt, n);
  let lcp = kasai(txt, suffixArr);

  // n - suffixArr[i] is the length of the suffix at sorted position i.
  // The first suffix shares nothing with a predecessor.
  let result = n - suffixArr[0];
  for (let i = 1; i < lcp.length; i++) {
    // subtract lcp from the length of suffix
    result += n - suffixArr[i] - lcp[i - 1];
  }

  result++; // For empty string
  return result;
}
// Driver code to test above methods
// Sample run: print the distinct-substring count of "ababa".
let txt = "ababa";
console.log(countDistinctSubstring(txt));
// This code is contributed by Utkarsh Kumar.
10
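Time Complexity : O(n log² n), where n is the length of the string. The suffix array is built in O(log n) doubling rounds, each dominated by an O(n log n) comparison sort; Kasai's LCP construction and the final summation are O(n).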
Auxiliary Space : O(n), where n is the length of string.
This article is contributed by Utkarsh Trivedi.
K
Array Data Structure
String in Data Structure
Hashing in Data Structure
Linked List Data Structure
Stack Data Structure
Queue Data Structure
Tree Data Structure
Graph Data Structure
Trie Data Structure
Searching Algorithms
Sorting Algorithms
Introduction to Recursion
Greedy Algorithms
Graph Algorithms
Dynamic Programming or DP
Bitwise Algorithms
Segment Tree
Binary Indexed Tree or Fenwick Tree
Square Root (Sqrt) Decomposition Algorithm
Binary Lifting
Geometry