I am using the following script to fuzzy-search nearly identical book names in order to find duplicates:
import re
from nltk.util import ngrams

OriginalBooksList = list()
booksAfterRemovingStopWords = list()
booksWithNGrams = list()
duplicatesSorted = list()

stopWords = ['I', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was', 'the',
             'and', 'A', 'About', 'An', 'Are', 'As', 'At', 'Be', 'By', 'Com', 'For', 'From', 'How', 'In', 'Is', 'It', 'Of', 'On', 'Or', 'That', 'The', 'This', 'To', 'Was', 'The', 'And']

with open('UnifiedBookList.txt') as fin:
    for line_no, line in enumerate(fin):
        OriginalBooksList.append(line)
        line = re.sub(r'[^\w\s]', ' ', line)  # replace punctuation with space
        line = re.sub(' +', ' ', line)  # replace multiple space with one
        line = line.lower()  # to lower case
        if line.strip() and len(line.split()) > 2:  # line can not be empty and line must have more than 2 words
            booksAfterRemovingStopWords.append(' '.join(
                [i for i in line.split() if i not in stopWords]))  # Remove Stop Words And Make Sentence

for line_no, line in enumerate(booksAfterRemovingStopWords):
    tokens = line.split(" ")
    output = list(ngrams(tokens, 3))
    temp = list()
    temp.append(OriginalBooksList[line_no])  # Adding original line
    for x in output:  # Adding n-grams
        temp.append(' '.join(x))
    booksWithNGrams.append(temp)

while booksWithNGrams:
    first_element = booksWithNGrams.pop(0)
    x = 0
    for mylist in booksWithNGrams:
        if set(first_element) & set(mylist):
            if x == 0:
                duplicatesSorted.append(first_element[0])
                x = 1
            duplicatesSorted.append(mylist[0])
            booksWithNGrams.remove(mylist)
    x = 0

with open('DuplicatesSorted.txt', 'w') as f:
    for item in duplicatesSorted:
        f.write("%s\n" % item)
The input is:
A Course of Pure Mathematics by G. H. Hardy
Agile Software Development, Principles, Patterns, and Practices by Robert C. Martin
Advanced Programming in the UNIX Environment, 3rd Edition
Advanced Selling Strategies: Brian Tracy
Advanced Programming in the UNIX(R) Environment
Alex's Adventures in Numberland: Dispatches from the Wonderful World of Mathematics by Alex Bellos, Andy Riley
Advertising Secrets of the Written Word: The Ultimate Resource on How to Write Powerful Advertising
Agile Software Development, Principles, Patterns, and Practices
A Course of Pure Mathematics (Cambridge Mathematical Library) 10th Edition by G. H. Hardy
Alex’s Adventures in Numberland
Advertising Secrets of the Written Word
Alex's Adventures in Numberland Paperback by Alex Bellos
The output is:
A Course of Pure Mathematics by G. H. Hardy
A Course of Pure Mathematics (Cambridge Mathematical Library) 10th Edition by G. H. Hardy
Agile Software Development, Principles, Patterns, and Practices by Robert C. Martin
Agile Software Development, Principles, Patterns, and Practices
Advanced Programming in the UNIX Environment, 3rd Edition
Advanced Programming in the UNIX(R) Environment
Alex's Adventures in Numberland: Dispatches from the Wonderful World of Mathematics by Alex Bellos, Andy Riley
Alex’s Adventures in Numberland
Alex's Adventures in Numberland Paperback by Alex Bellos
Advertising Secrets of the Written Word: The Ultimate Resource on How to Write Powerful Advertising
Advertising Secrets of the Written Word
Looking at the script, it seems to me that I have overcomplicated things. Please give me some suggestions on how I can make this code better.
2 Answers
Ok, I tried to rearrange it a bit:
- Corrected the stop words (they should only contain lowercase words)
- Used the Jaccard metric to calculate distances
- Rearranged the code structure
- Rewrote it in Python 3 with type annotations
You should now add an argument parser and that's basically it (a minimal sketch follows the code below). As far as I understood the task, the final goal is to remove duplicate books. Now you can play with the threshold argument to find out which strings should be treated as the same book.
import re
from typing import List, Callable, Set

from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams


def canonize(data: str) -> str:
    data = re.sub(r'[^\w\s]', ' ', data)  # replace punctuation with space
    data = re.sub(' +', ' ', data)  # replace multiple space with one
    return data.lower().strip()


def jaccard(book_a: str, book_b: str, n: int = 3) -> float:
    return 1 - jaccard_distance(set(ngrams(book_a, n)), set(ngrams(book_b, n)))


def filter_books(books: List[str],
                 book_filter_fun: Callable,
                 cmp_filter_func: Callable,
                 threshold: float = 0.3) -> Set[int]:
    excluded_indices = set()
    for one_book_offset, one_book in enumerate(books):
        if book_filter_fun(one_book):
            excluded_indices.add(one_book_offset)
        for another_book_offset, another_book in enumerate(books[one_book_offset + 1:], start=one_book_offset + 1):
            if {one_book_offset, another_book_offset} & excluded_indices:
                continue
            if cmp_filter_func(one_book, another_book) > threshold:
                excluded_indices.add(one_book_offset)
    return excluded_indices


if __name__ == '__main__':
    stopWords = {'i', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for', 'from', 'how', 'in', 'is', 'it',
                 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was'}
    with open('UnifiedBookList.txt') as fin:
        original_books = fin.readlines()
    canonized_books = list(map(canonize, original_books))
    excluded_indices = filter_books(
        canonized_books,
        lambda book: len(book.split()) < 2,  # a book name should contain at least 2 words
        jaccard,
    )
    with open('DuplicatesSorted.txt', 'w') as fout:
        for i, book in enumerate(original_books):
            if i in excluded_indices:
                continue
            fout.write(book)
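A minimal sketch of the argument parser mentioned above (the flag names `--input`, `--output`, and `--threshold` are placeholders of my own, not part of the answer's code); `filter_books` already takes `threshold` as a parameter, so the parsed value can be passed straight through:

import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description='Remove near-duplicate book titles.')
    parser.add_argument('--input', default='UnifiedBookList.txt',
                        help='file with one book title per line')
    parser.add_argument('--output', default='DuplicatesSorted.txt',
                        help='file to write the de-duplicated titles to')
    parser.add_argument('--threshold', type=float, default=0.3,
                        help='Jaccard similarity above which two titles count as the same book')
    return parser.parse_args()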
Don't know if it got any easier XD – kupihleba, Sep 16, 2020 at 19:30
far better than what I have done. – Ahmad Ismail, Sep 16, 2020 at 21:25
From the code, it looks like the criterion for saying two books match is that they share at least one n-gram. Given that, the code can be simplified quite a bit.
Basically, build a data structure as the book data is read in line by line. Each entry has the book name and its set of n-grams.
Then look for intersecting n-grams. Keep track of items that have already been matched up, so they aren't processed again.
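To make that criterion concrete, here is a small standalone sketch (not part of the answer's code) showing the word trigrams for two of the sample titles after lowercasing and stop-word removal; they share one trigram, so they would be grouped together:

from nltk.util import ngrams

# Tokens for "Advanced Programming in the UNIX Environment, 3rd Edition"
# and "Advanced Programming in the UNIX(R) Environment", stop words removed.
a = ('advanced', 'programming', 'unix', 'environment', '3rd', 'edition')
b = ('advanced', 'programming', 'unix', 'r', 'environment')

shared = set(ngrams(a, 3)) & set(ngrams(b, 3))
print(shared)  # {('advanced', 'programming', 'unix')}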
import io
import re

from nltk.util import ngrams

# `data` is the list of book titles shown above; `stopwords` is a set of
# lowercase stop words like the one in the question.

NAME = 0
NGRAM = 1
NGRAMSIZE = 3

book_data = []

with io.StringIO('\n'.join(data)) as fin:
    for line in fin:
        line = line.strip()
        words = re.findall(r'\w+', line.lower())
        good_words = tuple(w for w in words if w not in stopwords)
        n_grams = set(ngrams(good_words, NGRAMSIZE))
        book_data.append((line, n_grams))

used_indices = set()
grouped_books = []

for index, (_, book_ngrams) in enumerate(book_data):
    if index in used_indices:
        continue
    grouped_books.append(index)
    used_indices.add(index)
    for other_index, (_, other_ngrams) in enumerate(book_data[index + 1:], index + 1):
        if book_ngrams & other_ngrams:
            grouped_books.append(other_index)
            used_indices.add(other_index)

for index in grouped_books:
    print(f"{index:2} {book_data[index][NAME]}")
You could also consider using difflib from the standard library. Here's some code to show how it might be used.
import difflib as dl
import re

# `stopwords` is assumed to be the same lowercase stop-word set as above.

def isjunk(word):
    return word.lower() in stopwords  # treat stop words as junk so they don't drive the score

matcher = dl.SequenceMatcher(isjunk=isjunk)

with open('datafile.txt') as f:
    books = [line.strip().lower() for line in f]

titles = [re.findall(r'\w+', book) for book in books]

for i, seq2 in enumerate(titles):
    print('\n', i, books[i], '\n')
    matcher.set_seq2(seq2)
    for j, seq1 in enumerate(titles[i + 1:], i + 1):
        matcher.set_seq1(seq1)
        score = matcher.ratio()
        if score > 0.4:
            print(f"    {j:2} {score:4.2f} {books[j]}")