I'm posting my C++ code for LeetCode's Longest Duplicate Substring. If you have time and would like to review, please do so. Thank you!
Problem
Given a string S, consider all duplicated substrings: (contiguous) substrings of S that occur 2 or more times. (The occurrences may overlap.)
Return any duplicated substring that has the longest possible length. (If S does not have a duplicated substring, the answer is "".)
Example 1:
- Input: "banana"
- Output: "ana"
Example 2:
- Input: "abcd"
- Output: ""
Note:
- 2 <= S.length <= 10^5
- S consists of lowercase English letters.
Accepted C++
class Solution {
private:
const int prime = 19260817;
const int a_decimal = 65;
const int char_size = 26;
std::string res = "";
std::vector<int> exponent;
// Wikipedia
// The Rabin–Karp algorithm or Karp–Rabin algorithm is a string - searching algorithm that uses hashing to find an exact match of a pattern string in a text.
// It uses a rolling hash to quickly filter out positions of the text that cannot match the pattern,
// and then checks for a match at the remaining positions.
const std::string rabin_karp_search(const int length, const string& base) {
if (length == 0) {
return "";
}
std::unordered_map<int, vector<int>> hash_map = unordered_map<int, vector<int>>(); // hash memorization
long long curr = 0; // current hash
int index;
for (index = 0; index < length; index++) {
curr = ((curr * char_size) % prime + (base[index] - a_decimal)) % prime;
}
hash_map[curr] = std::vector<int>(1, 0);
for (index = length; index < base.length(); index++) {
curr = ((curr - (long long) exponent[length - 1] * (base[index - length] - a_decimal)) % prime + prime) % prime;
curr = (curr * char_size + (base[index] - a_decimal)) % prime;
if (hash_map.find(curr) == hash_map.end()) {
hash_map[curr] = std::vector<int>(1, -~index - length);
} else {
for (const auto iter : hash_map[curr]) {
if (std::strcmp((base.substr(iter, length)).data(), base.substr(-~index - length, length).data()) == 0) {
return base.substr(iter, length);
}
}
hash_map[curr].push_back(-~index - length);
}
}
return "";
}
// Wikipedia
// binary search is a search algorithm that finds the position of a target value within a sorted array.
// Binary search compares the target value to the middle element of the array.
// If they are not equal, the half in which the target cannot lie is eliminated and the search continues on the remaining half,
// again taking the middle element to compare to the target value, and repeating this until the target value is found.
// If the search ends with the remaining half being empty, the target is not in the array.
const std::string get_longest_binary_search(std::string base_string, std::string res) {
int lo = 0;
int hi = base_string.length();
while (lo <= hi) {
int mid = lo + ((hi - lo) >> 1);
std::string temp = rabin_karp_search(mid, base_string);
if (temp.length() == 0) {
hi = mid - 1;
} else {
if (temp.length() > res.length()) {
res = temp;
}
lo = -~mid;
}
}
return res;
}
public:
const std::string longestDupSubstring(const std::string base_string) {
res = "";
exponent = std::vector<int>(base_string.length(), 1);
int index;
for (index = 1; index < base_string.length(); index++) {
exponent[index] = (exponent[index - 1] * char_size) % prime;
}
return get_longest_binary_search(base_string, res);
}
};
LeetCode Solution in Java with additional comments (Not for review)
class Solution {
/*
Rabin-Karp with polynomial rolling hash.
Search a substring of given length
that occurs at least 2 times.
Return start position if the substring exits and -1 otherwise.
*/
public int search(int L, int a, long modulus, int n, int[] nums) {
// compute the hash of string S[:L]
long h = 0;
for(int i = 0; i < L; ++i) h = (h * a + nums[i]) % modulus;
// already seen hashes of strings of length L
HashSet<Long> seen = new HashSet();
seen.add(h);
// const value to be used often : a**L % modulus
long aL = 1;
for (int i = 1; i <= L; ++i) aL = (aL * a) % modulus;
for(int start = 1; start < n - L + 1; ++start) {
// compute rolling hash in O(1) time
h = (h * a - nums[start - 1] * aL % modulus + modulus) % modulus;
h = (h + nums[start + L - 1]) % modulus;
if (seen.contains(h)) return start;
seen.add(h);
}
return -1;
}
public String longestDupSubstring(String S) {
int n = S.length();
// convert string to array of integers
// to implement constant time slice
int[] nums = new int[n];
for(int i = 0; i < n; ++i) nums[i] = (int)S.charAt(i) - (int)'a';
// base value for the rolling hash function
int a = 26;
// modulus value for the rolling hash function to avoid overflow
long modulus = (long)Math.pow(2, 32);
// binary search, L = repeating string length
int left = 1, right = n;
int L;
while (left <= right) {
L = left + (right - left) / 2;
if (search(L, a, modulus, n, nums) != -1) left = L + 1;
else right = L - 1;
}
int start = search(left - 1, a, modulus, n, nums);
return S.substring(start, start + left - 1);
}
}
Reference
LeetCode is a platform only for interviewing and competitive programming. On LeetCode, there is a class usually named Solution
with one or more public
functions which we are not allowed to rename.
2 Answers 2
Avoid unnecessary member variables
You added res
and exponent
as member variables. However, they are only used inside longestDupSubString()
and functions called by it. You should just declare them inside longestDupSubString()
instead, and pass them by reference to other functions if necessary. But see below for why these variables might not be necessary at all.
Use character constants
Write const int a_decimal = 'a'
, so there is no need to know the ASCII table and no possibility for errors. However, then the question is, why define a_decimal
at all? It seems you want to force integer promotion, but you can make that more explicit. Instead of base[index] - a_decimal
, you can write (int)base[index] - 'a'
.
But this makes me wonder, why subtract 'a'
at all? Sure, the question says the input consists of only lowercase English numbers, but you can keep your solution generic.
Don't return const
values
There is no point in returning something by const
value. The following is perfectly valid:
const std::string foo() {
return "foo";
}
std::string bar = foo();
It only makes sense to make the return type const
if you are return a pointer or reference.
Avoid using namespace std
and/or #include <bits/stdc++.h>
I see you forgot to add std::
to some standard library types, implying that you have using namespace std
somewhere or are using the non-standard #include <bits/stdc++.h>
.
Give proper names to variables
Some of your naming choices are questionable:
char_size
: sounds like it would hold the result ofsizeof(char)
, but unstead it's the number of letters in the alphabet. Maybealphabet_size
would be better.hash_map
: the name is equivalent to the type (std::unordered_map
), but what you should have used is something that represents what information the hash map holds: substrings that you already visited. So maybevisited_substrings
is a better name.index
: this is one of the few times you can use a one-letter variable, likei
, since that is the idiomatic name for a loop counter in C++.iter
: infor(const auto iter: hash_map[curr])
, the variableiter
is not an iterator, but actually holds the value of one of the elements of astd::vector<int>
. Soelement
,item
orentry
would already be a better name, but even better is a name that reflects what that elements represents, namely an offset into the base string, sooffset
would be a good name here.
Your hash function can have collisions, and is unnecessary
Your hash function can have collisions if you ever have substrings longer than 32 / log2(26) = 6 characters. A collision would not be a problem if you would handle them, but you don't. Also, there is no need to create a hash yourself, since std::unordered_map
already does that for you! Just pass the substring to it directly:
std::unordered_map<std::string, std::vector<int>> visited_substrings;
auto substring = base.substr(0, length);
visited_substrings[substring] = {0};
Avoid repeating type names
There are a few places where you can avoid repeating type names. As shown above, when declaring a variable of type std::unordered_map
, it is already initialized to be an empty map, so no need to explicity initialize it with another empty map.
When assigning to an element of a std::unordered_map
, you can use an initializer list, and since the compiler knows the type of the map elements, you don't have to repeat that yourself. So visited_substrings[substring] = {0}
will initialize the vector with one integer with value 0
.
Don't use C library functions if there are perfectly fine C++ equivalents
When comparing C++ strings, don't use strcmp()
, but rather use the tools the std::string
class provides you. In particular, you can just use the ==
operator:
if (base.substr(offset, length) == base.substr(index + 1 - length, length)) {
return base.substr(offset, length);
}
Also, std::string
comes with the member function compare()
that can compare substrings directly:
if (base.compare(offset, length, base, index + 1 - length, length) == 0) {
return base.substr(offset, length);
}
Although it doesn't look like much of an improvement, it avoids having to create new temporary strings to hold the substrings.
Don't use bit twiddling tricks unnecessarily
There is no need to write -~index
when you can just write index + 1
. The latter is much clearer. Also, -~index
being equivalent to index + 1
assumes two's complement representation of integers, which is not guaranteed in C++17 (it is only since C++20).
Also, in int mid = lo + ((hi - lo) >> 1)
, just write int mid = lo + (hi - lo) / 2
, it is much clearer what the intention is. If you could use C++20, then you should use std::midpoint()
here, since there are many pitfalls in your simple approach, although it works fine in the constraints of this LeetCode problem.
Use unsigned integers where appropriate
For array indices, sizes and non-negative offsets, you should unsigned integers, or even better size_t
. There are several reasons for this:
- There's less chance of overflow. Note that unintended overlow might be a security issue.
- When using unsigned integers as function parameters, you never have to check whether they are non-negative if that is not allowed.
- There are less surprises when doing bitwise operations on unsigned integers.
- Some common standard library functions, such as
std::string::size()
, also return unsigned integers, so you won't get warnings about comparing signed to unsigned numbers.
Regarding that last point, ensure you have compiler warnings enabled and fix all warnings it produces.
-
1\$\begingroup\$ Please add indexes to the list of where unsigned integers are appropriate. It might be good to explain why unsigned is better (integer overflow comes to mind). \$\endgroup\$2020年06月20日 11:55:58 +00:00Commented Jun 20, 2020 at 11:55
-
1\$\begingroup\$ Wow, that's a pretty long video, but it's worth viewing every minute of it. \$\endgroup\$Roland Illig– Roland Illig2020年06月20日 12:39:42 +00:00Commented Jun 20, 2020 at 12:39
G. Sliepen wrote a rather comprehensive review, I'm going to expand on one point in their review, and add 2 others.
Avoid using namespace std and/or #include <bits/stdc++.h>
I see you forgot to add std:: to some standard library types, implying that you have using namespace std somewhere or are using the non-standard #include <bits/stdc++.h>.
The LeetCode is doing this for you and it is promoting bad habits that you need to unlearn. The proper includes for this code are
#include <vector>
#include <string>
#include <unordered_map>
If you are coding professionally you probably should get out of the habit of using the using namespace std;
statement. The code will more clearly define where cout
and other identifiers are coming from (std::cin
, std::cout
). As you start using namespaces in your code it is better to identify where each function comes from because there may be function name collisions from different namespaces. The identifiercout
you may override within your own classes, and you may override the operator <<
in your own classes as well. This stack overflow question discusses this in more detail.
More on Private, Protected and Public
I see that you do learn from your previous reviews and that is a good thing. In the following code the keyword private:
is not necessary, when you first open a class declaration and variables, methods and functions are private by default.
class Solution {
private:
const int prime = 19260817;
const int a_decimal = 65;
const int char_size = 26;
std::string res = "";
std::vector<int> exponent;
You will find that a lot of C++ programmers are no long using the first section of a class declaration because it is better to put the public interfaces at the beginning of a class so that users of that class can find the public interfaces easily. This actually applies to most object oriented programming languages. The general hierarchy is public first, then protected and then private.
Class File Structure
My concern here is that you're only learning C++ through LeetCode
that you are learning some bad habits that will need to be replaced at some point. C++ is generally broken up into header files and source files. You are fairly familiar with the header file grammar but you are not familiar with the source file grammar.
Historically the C++ programming language grew out of the C programming language which already had separate header files and source files. Unlike Java and C# most of the member functions and methods have function prototypes in the class declaration and the actual functions are defined in a .cpp
file. There are a couple of reasons for this, the first is that it allows bugs to be fixed in the code while not affecting the public interfaces. This means that patches or updated dynamically linked libraries can be developed and shipped to fix bugs without redoing the entire application. The other reason is that compile / build times are improved by reducing the contents of the header files.
There are 2 exceptions to this,
- For performance reasons if a function or method is not very complex it can be included in the header so that the compiler can try to
inline
it. This means that the code of the function will replace the function call where it is used. - There are complete libraries such as the Boost Library in
.hpp
files that provide a great deal of additional functionality (maybe even a binary search).
This is what the solutions class might look like in this case:
Solution.h
#ifndef LEETCODE1044_SOLUTION_H
#define LEETCODE1044_SOLUTION_H
#include <vector>
#include <string>
#include <unordered_map>
class Solution {
private:
const int prime = 19260817;
const int a_decimal = 65;
const int char_size = 26;
std::string res = "";
std::vector<int> exponent;
// Wikipedia
// The Rabin–Karp algorithm or Karp–Rabin algorithm is a string - searching algorithm that uses hashing to find an exact match of a pattern string in a text.
// It uses a rolling hash to quickly filter out positions of the text that cannot match the pattern,
// and then checks for a match at the remaining positions.
const std::string rabin_karp_search(const int length, const std::string& base);
// Wikipedia
// binary search is a search algorithm that finds the position of a target value within a sorted array.
// Binary search compares the target value to the middle element of the array.
// If they are not equal, the half in which the target cannot lie is eliminated and the search continues on the remaining half,
// again taking the middle element to compare to the target value, and repeating this until the target value is found.
// If the search ends with the remaining half being empty, the target is not in the array.
const std::string get_longest_binary_search(std::string base_string, std::string res);
public:
const std::string longestDupSubstring(const std::string base_string);
};
#endif //LEETCODE1044_SOLUTION_H
Solution.cpp
#include "Solution.h"
const std::string Solution::rabin_karp_search(const int length, const std::string &base)
{
if (length == 0) {
return "";
}
std::unordered_map<int, std::vector<int>> hash_map = std::unordered_map<int, std::vector<int>>(); // hash memorization
long long curr = 0; // current hash
int index;
for (index = 0; index < length; index++) {
curr = ((curr * char_size) % prime + (base[index] - a_decimal)) % prime;
}
hash_map[curr] = std::vector<int>(1, 0);
for (index = length; index < base.length(); index++) {
curr = ((curr - (long long) exponent[length - 1] * (base[index - length] - a_decimal)) % prime + prime) % prime;
curr = (curr * char_size + (base[index] - a_decimal)) % prime;
if (hash_map.find(curr) == hash_map.end()) {
hash_map[curr] = std::vector<int>(1, -~index - length);
} else {
for (const auto iter : hash_map[curr]) {
if (std::strcmp((base.substr(iter, length)).data(), base.substr(-~index - length, length).data()) == 0) {
return base.substr(iter, length);
}
}
hash_map[curr].push_back(-~index - length);
}
}
return "";
}
const std::string Solution::get_longest_binary_search(std::string base_string, std::string res)
{
int lo = 0;
int hi = base_string.length();
while (lo <= hi) {
int mid = lo + ((hi - lo) >> 1);
std::string temp = rabin_karp_search(mid, base_string);
if (temp.length() == 0) {
hi = mid - 1;
} else {
if (temp.length() > res.length()) {
res = temp;
}
lo = -~mid;
}
}
return res;
}
const std::string Solution::longestDupSubstring(const std::string base_string)
{
res = "";
exponent = std::vector<int>(base_string.length(), 1);
int index;
for (index = 1; index < base_string.length(); index++) {
exponent[index] = (exponent[index - 1] * char_size) % prime;
}
return get_longest_binary_search(base_string, res);
}
-
1\$\begingroup\$ Good points about bad habits taught by LeetCode. Another one is that a
class
is used at all. That might be necessary in Java, but in C++longestDupSubString()
could just be a stand-alone function. \$\endgroup\$G. Sliepen– G. Sliepen2020年06月20日 13:15:44 +00:00Commented Jun 20, 2020 at 13:15 -
1\$\begingroup\$ @G.Sliepen You're correct, it is forcing the usage of C++ as an object oriented language when it doesn't need to, but it also encapsulates all the code (any supporting functions). \$\endgroup\$2020年06月20日 13:24:07 +00:00Commented Jun 20, 2020 at 13:24
Explore related questions
See similar questions with these tags.