\$\begingroup\$
\$\endgroup\$
2
Is the algorithm correct? It seems to work.
print("Multiple sequence longest common subsequence implementation in Python.\nFor example a naive algorithm would perfom c*128^128 elementary operations on a problem set of 128 sequences of length 128 elements.")
print(" That is: ", 128**128, "operations.")
from collections import defaultdict
import sys
def ptable(m, n, table):
for i in range(m):
line = ""
for j in range(n):
line += str(table[i, j]) + " "
print(line)
def lcs(x, y):
table = defaultdict(int)
n = len(x)
m = len(y)
for i in range(n):
for j in range(m):
if x[i] == y[j]:
table[i, j] = table[i-1, j-1]+1
else:
table[i, j] = max(table[i-1, j], table[i, j-1])
return table
def mlcs(strings):
strings = list(set(strings))
tables = dict()
for i in range(1, len(strings)):
x = strings[i]
for y in strings[:i]:
lcsxy = lcs(x, y)
tables[x,y] = lcsxy
def rec(indexes):
for v in indexes.values():
if v < 0:
return ""
same = True
for i in range(len(strings)-1):
x = strings[i]
y = strings[i+1]
same &= x[indexes[x]] == y[indexes[y]]
if same:
x = strings[0]
c = x[indexes[x]]
for x in strings:
indexes[x] -= 1
return rec(indexes) + c
else:
v = -1
ind = None
for x in strings:
indcopy = indexes.copy()
indcopy[x] -= 1
icv = valueat(indcopy)
if icv > v:
ind = indcopy
v = icv
indexes = ind
return rec(indexes)
def valueat(indexes):
m = 12777216
for i in range(1, len(strings)):
x = strings[i]
for y in strings[:i]:
lcsxy = tables[x,y]
p = lcsxy[indexes[x],indexes[y]]
m = min(m, p)
return m
indexes = dict()
for x in strings:
indexes[x] = len(x) - 1
return rec(indexes)
def check(string, seq):
i = 0
j = 0
while i < len(string) and j < len(seq):
if string[i] == seq[j]:
j += 1
i += 1
return len(seq) - j
def checkall(strings, seq):
for x in strings:
a = check(x, seq)
if not a == 0:
print(x, seq, a)
return False
return True
strings = []
sigma = ["a", "b", "c", "d"]
#sigma = ["a", "b"]
import random
for i in range(5):
string = ""
for j in range(128):
string += random.choice(sigma)
strings += [string]
#strings = ["abbab", "ababa", "abbba"]
#strings = ["abab", "baba", "abaa"]
#strings = ["bacda", "abcde", "decac"]
#strings = ["babbabbb", "bbbaabaa", "abbbabab", "abbababa"]
#strings = ["human", "chimpanzee", "woman"]
#strings = ["ababa", "babab", "bbbab"]
#strings = ["ababaaba", "abbabbaa", "aaaaabba"]
#strings = ["aabbbaba", "aaaabbba", "aabbbaab"]
print("Strings:", strings)
l = mlcs(strings)
print("Lcs:", l, checkall(strings, l))`
200_success
146k22 gold badges191 silver badges481 bronze badges
-
\$\begingroup\$ Is it working as intended or is it not working as intended? \$\endgroup\$Mast– Mast ♦2015年05月05日 15:42:49 +00:00Commented May 5, 2015 at 15:42
-
\$\begingroup\$ It seems to work as intended with the methods of testing provided. Maybe one needs to find larger test cases. \$\endgroup\$user72935– user729352015年05月05日 15:48:44 +00:00Commented May 5, 2015 at 15:48
1 Answer 1
\$\begingroup\$
\$\endgroup\$
2
This test case generation method produces cases where the result is sometimes sub-optimal. Thus there is a flaw in the algorithm.
strings = []
sigma = ["a", "b", "c", "d"]
#sigma = ["a", "b"]
sigmax = ["e", " f", "g"]
import random
Nx = 8
N = 16
M = 5 # Number of strings
x = ""
for i in range(Nx):
x += random.choice(sigmax)
for i in range(M):
string = ""
for j in range(N):
string += random.choice(sigma)
indexes = list(range(N))
random.shuffle(indexes)
indexes = sorted(indexes[:len(x)])
for j in range(len(x)):
string = string[:indexes[j]]+x[j]+string[indexes[j]:]
strings += [string]
So, back to the drawing board, so to speak.
200_success
146k22 gold badges191 silver badges481 bronze badges
-
\$\begingroup\$ Could you add either an example or an analysis? \$\endgroup\$200_success– 200_success2015年05月05日 19:42:50 +00:00Commented May 5, 2015 at 19:42
-
\$\begingroup\$ Added example generating code. The idea of the lcs algorithm is an extension of the 2-string lcs. The function rec does the backtracking and valueat is supposed to produce the correct value at a position of an M-dimensional space (M is the number of sequences). For this purpose the values at each of the "coordinate planes" (each a pairing of two strings) are produced using the regular 2-dim lcs algorithm. There are (m^2-m)/2 such pairings/planes. \$\endgroup\$user72935– user729352015年05月06日 03:51:00 +00:00Commented May 6, 2015 at 3:51
You must log in to answer this question.
lang-py