Based on the reviewed code posted before at Multiple longest common subsequence (another algorithm) Multiple longest common subsequence (another algorithm)
Based on the reviewed code posted before at Multiple longest common subsequence (another algorithm)
Based on the reviewed code posted before at Multiple longest common subsequence (another algorithm)
Longest common subsequence (LCS) for multiple strings
How would you improve this code? Particularly the check and check_all functions?
The time complexity of the algorithm of mlcs is \$O(|\Sigma|MN)\$, where \$\Sigma\$ is the alphabet, M is the number of strings and N is the length of the strings. Is that right?
My analysis: candidates() performs \$O(|\Sigma|M)\$ operations and is called \$O(N)\$ times.
Based on the reviewed code posted before at Multiple longest common subsequence (another algorithm)
def check(string, seq):
i = 0
j = 0
while i < len(string) and j < len(seq):
if string[i] == seq[j]:
j += 1
i += 1
return len(seq) - j
def checkall(strings, seq):
for x in strings:
a = check(x, seq)
if not a == 0:
print(x, seq, a)
return False
return True
def mlcs(strings):
"""Return a long common subsequence of the strings.
"""
if not strings:
raise ValueError("mlcs() argument is an empty sequence")
strings = list(set(strings)) # deduplicate
alphabet = set.intersection(*(set(s) for s in strings))
# indexes[letter][i] is list of indexes of letter in strings[i].
indexes = {letter:[[] for _ in strings] for letter in alphabet}
for i, s in enumerate(strings):
for j, letter in enumerate(s):
if letter in alphabet:
indexes[letter][i].append(j)
# Generate candidate positions for next step in search.
def candidates():
for letter, letter_indexes in indexes.items():
candidate = []
for ind in letter_indexes:
if len(ind) < 1:
break
q = ind[0]
candidate.append(q)
else:
yield candidate
result = []
while True:
try:
# Choose the closest candidate position, if any.
pos = None
for c in candidates():
if not pos or sum(c) < sum(pos):
pos = c
letter = strings[0][pos[0]]
except TypeError:
return ''.join(result)
for let, letter_indexes in indexes.items():
for k, ind in enumerate(letter_indexes):
ind = [i for i in ind if i > pos[k]]
letter_indexes[k] = ind
result.append(letter)
strings = []
# Alphabet for the strings.
sigma = ["a", "b", "c", "d"]
# Alphabet for the LCS.
sigmax = ["e", "f", "g"]
import random
Nx = 67 # Length of LCS.
N = 128 # Length of strings. N >= Nx.
M = 128 # Number of strings.
x = ""
for _ in range(Nx):
x += random.choice(sigmax)
for _ in range(M):
string = ""
for _ in range(N):
string += random.choice(sigma)
indexes = list(range(N))
random.shuffle(indexes)
indexes = sorted(indexes[:len(x)])
for j in range(len(x)):
string = string[:indexes[j]]+x[j]+string[indexes[j]+1:]
strings += [string]
#strings = ["abbab", "ababa", "abbba"]
#strings = ["abab", "baba", "abaa"]
#strings = ["bacda", "abcde", "decac"]
#strings = ["babbabbb", "bbbaabaa", "abbbabab", "abbababa"]
#strings = ["ab", "aba"]
#print("Strings:")
#print(strings)
l = mlcs(strings)
print("LCS:")
print(l, len(l), checkall(strings, l))