Count of distinct substrings of a string using Suffix Array

Given a string of length n consisting of lowercase alphabet characters, we need to count the total number of distinct substrings of this string (the empty substring is counted as well).

Examples:

Input : str = "ababa"
Output : 10
There are 10 distinct substrings in total (counting the empty string), which are:
"", "a", "b", "ab", "ba", "aba", "bab", "abab", "baba"
and "ababa"

We have discussed a Suffix Trie based solution in below post :
Count of distinct substrings of a string using Suffix Trie

We can solve this problem using suffix array and longest common prefix concept. A suffix array is a sorted array of all suffixes of a given string.
For string "ababa" suffixes are : "ababa", "baba", "aba", "ba", "a". After taking these suffixes in sorted form we get our suffix array as [4, 2, 0, 3, 1]
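Then we calculate the LCP array using Kasai's algorithm. For string "ababa", the LCP array is [1, 3, 0, 2, 0], where lcp[i] is the length of the longest common prefix of the suffixes at positions i and i+1 of the suffix array (the last entry is 0 because that suffix has no successor).
After constructing both arrays, we calculate the total number of distinct substrings by keeping this fact in mind: every substring is a prefix of some suffix, so if we look through the prefixes of each suffix of a string, we cover all substrings of that string. A prefix that a suffix shares with the previous suffix in sorted order has already been counted, which is why the LCP is subtracted.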

We will explain the procedure for above example,

String = "ababa"
Suffixes in sorted order : "a", "aba", "ababa",
 "ba", "baba"
Initializing distinct substring count by length
of first suffix, 
Count = length("a") = 1 
Substrings taken in consideration : "a"
Now we consider each consecutive pair of suffixes, 
lcp("a", "aba") = "a".
All characters that are not part of the longest 
common prefix contribute to a distinct substring. 
In the above case, they are 'b' and 'a'. So they 
should be added to Count.
Count += length("aba") - lcp("a", "aba") 
Count = 3 
Substrings taken in consideration : "aba", "ab"
Similarly for next pair also,
Count += length("ababa") - lcp("aba", "ababa")
Count = 5
Substrings taken in consideration : "ababa", "abab"
Count += length("ba") - lcp("ababa", "ba")
Count = 7
Substrings taken in consideration : "ba", "b"
Count += length("baba") - lcp("ba", "baba")
Count = 9
Substrings taken in consideration : "baba", "bab"
We finally add 1 for empty string.
count = 10

Implementation:

CPP

// C++ code to count total distinct substrings
// of a string
#include<bits/stdc++.h>
using namespace std; // the listing below refers to std names without qualification
// Structure to store information of a suffix
struct suffix
{
    int index;   // To store original index (start position in the text)
    int rank[2]; // To store ranks and next rank pair: rank[0] is the rank of
                 // the suffix's first half, rank[1] the rank of the half
                 // that follows it (-1 when that half is past the end)
};
// A comparison function used by sort() to compare
// two suffixes. Compares two pairs, returns 1 if
// first pair is smaller
intcmp(structsuffixa,structsuffixb)
{
return(a.rank[0]==b.rank[0])?
(a.rank[1]<b.rank[1]?1:0):
(a.rank[0]<b.rank[0]?1:0);
}
// This is the main function that takes a string
// 'txt' of size n as an argument, builds and return
// the suffix array for the given string
vector<int>buildSuffixArray(stringtxt,intn)
{
// A structure to store suffixes and their indexes
structsuffixsuffixes[n];
// Store suffixes and their indexes in an array
// of structures. The structure is needed to sort
// the suffixes alphabetically and maintain their
// old indexes while sorting
for(inti=0;i<n;i++)
{
suffixes[i].index=i;
suffixes[i].rank[0]=txt[i]-'a';
suffixes[i].rank[1]=((i+1)<n)?
(txt[i+1]-'a'):-1;
}
// Sort the suffixes using the comparison function
// defined above.
sort(suffixes,suffixes+n,cmp);
// At his point, all suffixes are sorted according
// to first 2 characters. Let us sort suffixes
// according to first 4 characters, then first
// 8 and so on
intind[n];// This array is needed to get the
// index in suffixes[] from original
// index. This mapping is needed to get
// next suffix.
for(intk=4;k<2*n;k=k*2)
{
// Assigning rank and index values to first suffix
intrank=0;
intprev_rank=suffixes[0].rank[0];
suffixes[0].rank[0]=rank;
ind[suffixes[0].index]=0;
// Assigning rank to suffixes
for(inti=1;i<n;i++)
{
// If first rank and next ranks are same as
// that of previous suffix in array, assign
// the same new rank to this suffix
if(suffixes[i].rank[0]==prev_rank&&
suffixes[i].rank[1]==suffixes[i-1].rank[1])
{
prev_rank=suffixes[i].rank[0];
suffixes[i].rank[0]=rank;
}
else// Otherwise increment rank and assign
{
prev_rank=suffixes[i].rank[0];
suffixes[i].rank[0]=++rank;
}
ind[suffixes[i].index]=i;
}
// Assign next rank to every suffix
for(inti=0;i<n;i++)
{
intnextindex=suffixes[i].index+k/2;
suffixes[i].rank[1]=(nextindex<n)?
suffixes[ind[nextindex]].rank[0]:-1;
}
// Sort the suffixes according to first k characters
sort(suffixes,suffixes+n,cmp);
}
// Store indexes of all sorted suffixes in the suffix
// array
vector<int>suffixArr;
for(inti=0;i<n;i++)
suffixArr.push_back(suffixes[i].index);
// Return the suffix array
returnsuffixArr;
}
/* To construct and return LCP (Kasai's algorithm).

   txt       : the text the suffix array was built from
   suffixArr : suffix array of txt (start indexes in sorted order)

   Returns lcp[] of the same size as suffixArr, where lcp[r] is the length
   of the longest common prefix of the suffixes at positions r and r + 1
   of the suffix array. The entry for the last position is 0. */
std::vector<int> kasai(const std::string& txt, const std::vector<int>& suffixArr)
{
    const int n = static_cast<int>(suffixArr.size());

    // To store LCP array
    std::vector<int> lcp(n, 0);

    // An auxiliary array to store inverse of suffix array
    // elements. For example if suffixArr[0] is 5, the
    // invSuff[5] would store 0. This is used to get next
    // suffix string from suffix array.
    std::vector<int> invSuff(n, 0);
    for (int i = 0; i < n; i++)
        invSuff[suffixArr[i]] = i;

    // Length of the match carried over from the previous suffix
    int k = 0;

    // Process all suffixes one by one starting from
    // first suffix in txt[]
    for (int i = 0; i < n; i++)
    {
        /* If the current suffix is at n-1, then we don't
           have a next suffix to compare with. Its lcp entry
           stays zero and the carried match is reset. */
        if (invSuff[i] == n - 1)
        {
            k = 0;
            continue;
        }

        /* j is the start of the suffix that follows the
           present one in the suffix array */
        const int j = suffixArr[invSuff[i] + 1];

        // Directly start matching from k'th index as
        // at-least k-1 characters will match
        while (i + k < n && j + k < n && txt[i + k] == txt[j + k])
            k++;

        lcp[invSuff[i]] = k; // lcp for the present suffix.

        // The next suffix drops the starting character, so at most
        // one matched character is lost.
        if (k > 0)
            k--;
    }

    // return the constructed lcp array
    return lcp;
}
// method to return count of total distinct substring
intcountDistinctSubstring(stringtxt)
{
intn=txt.length();
// calculating suffix array and lcp array
vector<int>suffixArr=buildSuffixArray(txt,n);
vector<int>lcp=kasai(txt,suffixArr);
// n - suffixArr[i] will be the length of suffix
// at ith position in suffix array initializing
// count with length of first suffix of sorted
// suffixes
intresult=n-suffixArr[0];
for(inti=1;i<lcp.size();i++)
// subtract lcp from the length of suffix
result+=(n-suffixArr[i])-lcp[i-1];
result++;// For empty string
returnresult;
}
// Driver code to test above methods
int main()
{
    const std::string txt = "ababa";
    std::cout << countDistinctSubstring(txt); // expected output: 10
    return 0;
}

Java

/*package whatever //do not write package name here */
importjava.util.*;
class SuffiximplementsComparable<Suffix>{
intindex;
int[]rank=newint[2];
publicintcompareTo(Suffixs)
{
if(rank[0]==s.rank[0]){
returnInteger.compare(rank[1],s.rank[1]);
}
else{
returnInteger.compare(rank[0],s.rank[0]);
}
}
}
class Main{
staticint[]buildSuffixArray(Stringtxt,intn)
{
Suffix[]suffixes=newSuffix[n];
for(inti=0;i<n;i++){
suffixes[i]=newSuffix();
suffixes[i].index=i;
suffixes[i].rank[0]=txt.charAt(i)-'a';
suffixes[i].rank[1]
=(i+1)<n?txt.charAt(i+1)-'a'
:-1;
}
// Sort the suffixes
Arrays.sort(suffixes);
int[]ind=newint[n];
for(intk=4;k<2*n;k=k*2){
// Assigning rank and index values to first
// suffix
intrank=0;
intprevRank=suffixes[0].rank[0];
suffixes[0].rank[0]=rank;
ind[suffixes[0].index]=0;
for(inti=1;i<n;i++){
// If first rank and next ranks are same as
// that of previous suffix in array, assign
// the same new rank to this suffix
if(suffixes[i].rank[0]==prevRank
&&suffixes[i].rank[1]
==suffixes[i-1].rank[1]){
prevRank=suffixes[i].rank[0];
suffixes[i].rank[0]=rank;
}
else{// Otherwise increment rank and
// assign
prevRank=suffixes[i].rank[0];
suffixes[i].rank[0]=++rank;
}
ind[suffixes[i].index]=i;
}
for(inti=0;i<n;i++){
intnextIndex=suffixes[i].index+k/2;
suffixes[i].rank[1]
=nextIndex<n
?suffixes[ind[nextIndex]].rank[0]
:-1;
}
Arrays.sort(suffixes);
}
// Store indexes of all sorted suffixes in the
// suffix array
int[]suffixArr=newint[n];
for(inti=0;i<n;i++){
suffixArr[i]=suffixes[i].index;
}
returnsuffixArr;
}
staticint[]Const_LCP(Stringtxt,int[]suffixArr)
{
intn=suffixArr.length;
int[]lcp=newint[n];
int[]invSuff=newint[n];
for(inti=0;i<n;i++){
invSuff[suffixArr[i]]=i;
}
intk=0;
for(inti=0;i<n;i++){
if(invSuff[i]==n-1){
k=0;
continue;
}
intj=suffixArr[invSuff[i]+1];
while(i+k<n&&j+k<n
&&txt.charAt(i+k)
==txt.charAt(j+k)){
k++;
}
lcp[invSuff[i]]=k;
if(k>0){
k--;
}
}
returnlcp;
}
staticintcnt_Dist_Substr(Stringtxt)
{
intn=txt.length();
// calculating suffix array and lcp array
int[]suffixArr=buildSuffixArray(txt,n);
int[]lcp=Const_LCP(txt,suffixArr);
// suffixes
intresult=n-suffixArr[0];
for(inti=1;i<lcp.length;i++){
// subtract lcp from the length of suffix
result+=(n-suffixArr[i])-lcp[i-1];
}
result++;// For empty string
returnresult;
}
publicstaticvoidmain(String[]args)
{
Stringtxt="ababa";
System.out.println(cnt_Dist_Substr(txt));
}
}
// This code is contributed by Jay

Python3

# Python code to count total distinct substrings
# of a string
# This is the main function that takes a string
# 'txt' of size n as an argument, builds and return
# the suffix array for the given string
def build_suffix_array(txt, n):
    """Return the suffix array of the first ``n`` characters of ``txt``.

    The suffixes are sorted by their first 2 characters, then by their
    first 4, 8, ... characters, re-sorting on a [rank, next rank] pair in
    every round. Ranks are seeded with ``ord(txt[i]) - ord('a')``.

    Returns a list of the suffix start indexes in sorted order.
    """
    # Structure to store information of a suffix
    class Suffix:
        def __init__(self, index, rank):
            self.index = index  # To store original index
            self.rank = rank  # To store ranks and next rank pair

    # Store suffixes and their indexes in an array
    # of structures. The structure is needed to sort
    # the suffixes alphabetically and maintain their
    # old indexes while sorting
    suffixes = [
        Suffix(i, [ord(txt[i]) - ord('a'),
                   ord(txt[i + 1]) - ord('a') if i + 1 < n else -1])
        for i in range(n)
    ]

    # Sort the suffixes by their [rank, next rank] pair
    # (lists compare element by element).
    suffixes.sort(key=lambda x: x.rank)

    # At this point, all suffixes are sorted according
    # to first 2 characters. Let us sort suffixes
    # according to first 4 characters, then first
    # 8 and so on

    # This array is needed to get the
    # index in suffixes[] from original
    # index. This mapping is needed to get
    # next suffix.
    ind = [0] * n

    k = 4
    while k < 2 * n:
        # Assigning rank and index values to first suffix
        rank, prev_rank = 0, suffixes[0].rank[0]
        suffixes[0].rank[0] = rank
        ind[suffixes[0].index] = 0

        # Assigning rank to suffixes
        for i in range(1, n):
            # If first rank and next ranks are same as
            # that of previous suffix in array, assign
            # the same new rank to this suffix
            if (suffixes[i].rank[0] == prev_rank
                    and suffixes[i].rank[1] == suffixes[i - 1].rank[1]):
                prev_rank = suffixes[i].rank[0]
                suffixes[i].rank[0] = rank
            # Otherwise increment rank and assign
            else:
                prev_rank = suffixes[i].rank[0]
                rank += 1
                suffixes[i].rank[0] = rank
            ind[suffixes[i].index] = i

        # Assign next rank to every suffix: the rank of the suffix
        # starting k//2 characters later, or -1 past the end
        for i in range(n):
            nextindex = suffixes[i].index + k // 2
            suffixes[i].rank[1] = (
                suffixes[ind[nextindex]].rank[0] if nextindex < n else -1
            )

        # Sort the suffixes according to first k characters
        suffixes.sort(key=lambda x: x.rank)
        k *= 2

    # Return the indexes of all sorted suffixes as the suffix array
    return [suffix.index for suffix in suffixes]
# To construct and return LCP
def kasai(txt, suffixArr):
    """Return the LCP array for ``txt`` and its suffix array (Kasai).

    ``lcp[r]`` is the length of the longest common prefix of the suffixes
    at positions ``r`` and ``r + 1`` of ``suffixArr``; the entry for the
    last position is 0.
    """
    n = len(suffixArr)

    # To store LCP array
    lcp = [0] * n

    # An auxiliary array to store inverse of suffix array
    # elements. For example if suffixArr[0] is 5, the
    # invSuff[5] would store 0. This is used to get next
    # suffix string from suffix array.
    invSuff = [0] * n

    # Fill values in invSuff[]
    for i in range(n):
        invSuff[suffixArr[i]] = i

    # Initialize length of previous LCP
    k = 0

    # Process all suffixes one by one starting from
    # first suffix in txt[]
    for i in range(n):
        # If the current suffix is at n-1, then we don't
        # have next substring to consider. So lcp is not
        # defined for this substring, we put zero
        if invSuff[i] == n - 1:
            k = 0
            continue

        # j contains index of the next substring to
        # be considered to compare with the present
        # substring, i.e., next string in suffix array
        j = suffixArr[invSuff[i] + 1]

        # Directly start matching from k'th index as
        # at-least k-1 characters will match
        while i + k < n and j + k < n and txt[i + k] == txt[j + k]:
            k += 1
        lcp[invSuff[i]] = k  # lcp for the present suffix.

        # Deleting the starting character from the string.
        if k > 0:
            k -= 1

    # return the constructed lcp array
    return lcp
# method to return count of total distinct substring
def count_distinct_substring(txt):
    """Return the number of distinct substrings of ``txt``.

    The empty substring is included, so an empty ``txt`` gives 1.
    """
    n = len(txt)

    # Only the empty substring exists; this also avoids indexing
    # suffixArr[0] on an empty suffix array.
    if n == 0:
        return 1

    # calculating suffix array and lcp array
    suffixArr = build_suffix_array(txt, n)
    lcp = kasai(txt, suffixArr)

    # n - suffixArr[i] will be the length of suffix
    # at ith position in suffix array initializing
    # count with length of first suffix of sorted
    # suffixes
    result = n - suffixArr[0]

    for i in range(1, len(lcp)):
        # subtract lcp from the length of suffix
        result += (n - suffixArr[i]) - lcp[i - 1]

    result += 1  # For empty string
    return result
# Demo run: count the distinct substrings of a sample string and print the total
txt = "ababa"
distinct_total = count_distinct_substring(txt)
print(distinct_total)
# This code is contributed by Aman Kumar

// C# code addition 
using System;
using System.Linq;
/// <summary>
/// Information about one suffix of the text: where it starts and the
/// pair of ranks it is currently sorted by.
/// </summary>
class Suffix : IComparable<Suffix>
{
    public int index; // original start index of the suffix
    public int[] rank = new int[2]; // rank and next rank pair

    /// <summary>Orders suffixes by rank[0], breaking ties with rank[1].</summary>
    /// <param name="s">The suffix to compare against.</param>
    /// <returns>Negative, zero or positive as this suffix sorts before,
    /// equal to, or after <paramref name="s"/>.</returns>
    public int CompareTo(Suffix s)
    {
        if (rank[0] == s.rank[0])
        {
            return rank[1].CompareTo(s.rank[1]);
        }
        else
        {
            return rank[0].CompareTo(s.rank[0]);
        }
    }
}
/// <summary>
/// Counts the distinct substrings of a string using a suffix array and
/// an LCP array.
/// </summary>
class Program
{
    /// <summary>
    /// Builds the suffix array of the first n characters of txt by
    /// sorting the suffixes on their first 2, 4, 8, ... characters.
    /// </summary>
    /// <param name="txt">Text to index; ranks are seeded with txt[i] - 'a'.</param>
    /// <param name="n">Number of characters of txt to index.</param>
    /// <returns>Start indexes of the suffixes in sorted order.</returns>
    static int[] buildSuffixArray(string txt, int n)
    {
        Suffix[] suffixes = new Suffix[n];
        for (int i = 0; i < n; i++)
        {
            suffixes[i] = new Suffix();
            suffixes[i].index = i;
            suffixes[i].rank[0] = txt[i] - 'a';
            suffixes[i].rank[1] = (i + 1) < n ? txt[i + 1] - 'a' : -1;
        }

        // Sort the suffixes by their first 2 characters
        Array.Sort(suffixes);

        // ind[p] is the position in suffixes[] of the suffix that
        // starts at text index p
        int[] ind = new int[n];
        for (int k = 4; k < 2 * n; k = k * 2)
        {
            // Assigning rank and index values to first
            // suffix
            int rank = 0;
            int prevRank = suffixes[0].rank[0];
            suffixes[0].rank[0] = rank;
            ind[suffixes[0].index] = 0;
            for (int i = 1; i < n; i++)
            {
                // If first rank and next ranks are same as
                // that of previous suffix in array, assign
                // the same new rank to this suffix
                if (suffixes[i].rank[0] == prevRank
                    && suffixes[i].rank[1] == suffixes[i - 1].rank[1])
                {
                    prevRank = suffixes[i].rank[0];
                    suffixes[i].rank[0] = rank;
                }
                else
                {
                    // Otherwise increment rank and assign
                    prevRank = suffixes[i].rank[0];
                    suffixes[i].rank[0] = ++rank;
                }
                ind[suffixes[i].index] = i;
            }

            // Next rank is the rank of the suffix starting k/2
            // characters later, or -1 when that is past the end
            for (int i = 0; i < n; i++)
            {
                int nextIndex = suffixes[i].index + k / 2;
                suffixes[i].rank[1] = nextIndex < n
                    ? suffixes[ind[nextIndex]].rank[0]
                    : -1;
            }

            // Sort the suffixes according to first k characters
            Array.Sort(suffixes);
        }

        // Store indexes of all sorted suffixes in the
        // suffix array
        int[] suffixArr = new int[n];
        for (int i = 0; i < n; i++)
        {
            suffixArr[i] = suffixes[i].index;
        }
        return suffixArr;
    }

    /// <summary>Builds the LCP array with Kasai's algorithm.</summary>
    /// <param name="txt">The text the suffix array was built from.</param>
    /// <param name="suffixArr">Suffix array of txt.</param>
    /// <returns>lcp[] where lcp[r] is the length of the longest common
    /// prefix of the suffixes at positions r and r + 1 of the suffix
    /// array; the last entry is 0.</returns>
    static int[] Const_LCP(string txt, int[] suffixArr)
    {
        int n = suffixArr.Length;
        int[] lcp = new int[n];

        // invSuff[p] is the position in suffixArr of the suffix
        // starting at text index p
        int[] invSuff = new int[n];
        for (int i = 0; i < n; i++)
        {
            invSuff[suffixArr[i]] = i;
        }

        // Length of the match carried over from the previous suffix
        int k = 0;
        for (int i = 0; i < n; i++)
        {
            // The last suffix in sorted order has no successor
            if (invSuff[i] == n - 1)
            {
                k = 0;
                continue;
            }
            int j = suffixArr[invSuff[i] + 1];

            // Start matching from the k'th character
            while (i + k < n && j + k < n
                   && txt[i + k] == txt[j + k])
            {
                k++;
            }
            lcp[invSuff[i]] = k;

            // The next suffix drops the starting character
            if (k > 0)
            {
                k--;
            }
        }
        return lcp;
    }

    /// <summary>
    /// Counts the distinct substrings of txt, including the empty one.
    /// </summary>
    /// <param name="txt">Input string.</param>
    /// <returns>Number of distinct substrings; 1 for an empty string.</returns>
    static int cnt_Dist_Substr(string txt)
    {
        int n = txt.Length;

        // Only the empty substring exists; this also avoids reading
        // suffixArr[0] from an empty array
        if (n == 0)
        {
            return 1;
        }

        // calculating suffix array and lcp array
        int[] suffixArr = buildSuffixArray(txt, n);
        int[] lcp = Const_LCP(txt, suffixArr);

        // n - suffixArr[i] is the length of the suffix at position i;
        // start with the length of the first sorted suffix
        int result = n - suffixArr[0];
        for (int i = 1; i < lcp.Length; i++)
        {
            // subtract lcp from the length of suffix
            result += (n - suffixArr[i]) - lcp[i - 1];
        }
        result++; // For empty string
        return result;
    }

    static void Main()
    {
        string txt = "ababa";
        Console.WriteLine(cnt_Dist_Substr(txt));
    }
}
// The code is contributed by Arushi Goel.

JavaScript

// Javascript code to count total distinct substrings
// of a string

// This is the main function that takes a string
// 'txt' of size n as an argument, builds and return
// the suffix array for the given string.
// The suffixes are sorted by their first 2 characters, then by
// their first 4, 8, ... characters. Ranks are seeded with the
// character code minus the code of "a".
function buildSuffixArray(txt, n) {
  // Structure to store information of a suffix
  class Suffix {
    constructor() {
      this.index = 0; // To store original index
      this.rank = [0, 0]; // To store ranks and next rank pair
    }
  }

  // A comparison function used by sort() to compare
  // two suffixes. Compares the two rank pairs and returns a
  // negative number if the first pair is smaller, a positive
  // number if it is larger, and 0 if they are equal.
  function cmp(a, b) {
    return a.rank[0] !== b.rank[0]
      ? a.rank[0] - b.rank[0]
      : a.rank[1] - b.rank[1];
  }

  // A structure to store suffixes and their indexes
  let suffixes = new Array(n);

  // Store suffixes and their indexes in an array
  // of structures. The structure is needed to sort
  // the suffixes alphabetically and maintain their
  // old indexes while sorting
  const base = "a".charCodeAt(0);
  for (let i = 0; i < n; i++) {
    suffixes[i] = new Suffix();
    suffixes[i].index = i;
    suffixes[i].rank[0] = txt.charCodeAt(i) - base;
    suffixes[i].rank[1] = i + 1 < n ? txt.charCodeAt(i + 1) - base : -1;
  }

  // Sort the suffixes using the comparison function
  // defined above.
  suffixes.sort(cmp);

  // At this point, all suffixes are sorted according
  // to first 2 characters. Let us sort suffixes
  // according to first 4 characters, then first
  // 8 and so on

  // This array is needed to get the index in suffixes[]
  // from original index. This mapping is needed to get
  // next suffix.
  let ind = new Array(n);

  for (let k = 4; k < 2 * n; k *= 2) {
    // Assigning rank and index values to first suffix
    let rank = 0;
    let prev_rank = suffixes[0].rank[0];
    suffixes[0].rank[0] = rank;
    ind[suffixes[0].index] = 0;

    // Assigning rank to suffixes
    for (let i = 1; i < n; i++) {
      // If first rank and next ranks are same as
      // that of previous suffix in array, assign
      // the same new rank to this suffix
      if (
        suffixes[i].rank[0] === prev_rank &&
        suffixes[i].rank[1] === suffixes[i - 1].rank[1]
      ) {
        prev_rank = suffixes[i].rank[0];
        suffixes[i].rank[0] = rank;
      } else {
        // Otherwise increment rank and assign
        prev_rank = suffixes[i].rank[0];
        suffixes[i].rank[0] = ++rank;
      }
      ind[suffixes[i].index] = i;
    }

    // Assign next rank to every suffix (k is a power of two,
    // so k / 2 is an integer)
    for (let i = 0; i < n; i++) {
      let nextindex = suffixes[i].index + k / 2;
      suffixes[i].rank[1] =
        nextindex < n ? suffixes[ind[nextindex]].rank[0] : -1;
    }

    // Sort the suffixes according to first k characters
    suffixes.sort(cmp);
  }

  // Store indexes of all sorted suffixes in the suffix
  // array
  let suffixArr = new Array(n);
  for (let i = 0; i < n; i++) suffixArr[i] = suffixes[i].index;

  // Return the suffix array
  return suffixArr;
}

/* To construct and return LCP (Kasai's algorithm).
   lcp[r] is the length of the longest common prefix of the suffixes
   at positions r and r + 1 of suffixArr; the last entry is 0. */
function kasai(txt, suffixArr) {
  let n = suffixArr.length;

  // To store LCP array
  let lcp = new Array(n).fill(0);

  // An auxiliary array to store inverse of suffix array
  // elements. For example if suffixArr[0] is 5, the
  // invSuff[5] would store 0. This is used to get next
  // suffix string from suffix array.
  let invSuff = new Array(n).fill(0);

  // Fill values in invSuff[]
  for (let i = 0; i < n; i++) invSuff[suffixArr[i]] = i;

  // Length of the match carried over from the previous suffix
  let k = 0;

  // Process all suffixes one by one starting from
  // first suffix in txt[]
  for (let i = 0; i < n; i++) {
    /* If the current suffix is at n-1, then we don't
       have next substring to consider. So lcp is not
       defined for this substring, we put zero. */
    if (invSuff[i] === n - 1) {
      k = 0;
      continue;
    }

    /* j contains index of the next substring to
       be considered to compare with the present
       substring, i.e., next string in suffix array */
    let j = suffixArr[invSuff[i] + 1];

    // Directly start matching from k'th index as
    // at-least k-1 characters will match
    while (i + k < n && j + k < n && txt[i + k] === txt[j + k]) k++;

    lcp[invSuff[i]] = k; // lcp for the present suffix.

    // Deleting the starting character from the string.
    if (k > 0) k--;
  }

  // return the constructed lcp array
  return lcp;
}

// method to return count of total distinct substring
// (the empty substring is included, so "" gives 1)
function countDistinctSubstring(txt) {
  let n = txt.length;

  // Only the empty substring exists. Without this guard
  // suffixArr[0] is undefined and the result becomes NaN.
  if (n === 0) return 1;

  // calculating suffix array and lcp array
  let suffixArr = buildSuffixArray(txt, n);
  let lcp = kasai(txt, suffixArr);

  // n - suffixArr[i] will be the length of suffix
  // at ith position in suffix array initializing
  // count with length of first suffix of sorted
  // suffixes
  let result = n - suffixArr[0];

  for (let i = 1; i < lcp.length; i++) {
    // subtract lcp from the length of suffix
    result += (n - suffixArr[i]) - lcp[i - 1];
  }

  result++; // For empty string
  return result;
}

// Driver code to test above methods
let txt = "ababa";
console.log(countDistinctSubstring(txt));

// This code is contributed by Utkarsh Kumar.