I am using the following script to fuzzy-search nearly identical book names in order to find duplicates:
import re
from nltk.util import ngrams

OriginalBooksList = list()
booksAfterRemovingStopWords = list()
booksWithNGrams = list()
duplicatesSorted = list()

stopWords = ['I', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was', 'the',
             'and', 'A', 'About', 'An', 'Are', 'As', 'At', 'Be', 'By', 'Com', 'For', 'From', 'How', 'In', 'Is', 'It', 'Of', 'On', 'Or', 'That', 'The', 'This', 'To', 'Was', 'The', 'And']

with open('UnifiedBookList.txt') as fin:
    for line_no, line in enumerate(fin):
        OriginalBooksList.append(line)
        line = re.sub(r'[^\w\s]', ' ', line)  # replace punctuation with space
        line = re.sub(' +', ' ', line)  # replace multiple space with one
        line = line.lower()  # to lower case
        if line.strip() and len(line.split()) > 2:  # line can not be empty and line must have more than 2 words
            booksAfterRemovingStopWords.append(' '.join(
                [i for i in line.split() if i not in stopWords]))  # Remove Stop Words And Make Sentence

for line_no, line in enumerate(booksAfterRemovingStopWords):
    tokens = line.split(" ")
    output = list(ngrams(tokens, 3))
    temp = list()
    temp.append(OriginalBooksList[line_no])  # Adding original line
    for x in output:  # Adding n-grams
        temp.append(' '.join(x))
    booksWithNGrams.append(temp)

while booksWithNGrams:
    first_element = booksWithNGrams.pop(0)
    x = 0
    for mylist in booksWithNGrams:
        if set(first_element) & set(mylist):
            if x == 0:
                duplicatesSorted.append(first_element[0])
                x = 1
            duplicatesSorted.append(mylist[0])
            booksWithNGrams.remove(mylist)
    x = 0

with open('DuplicatesSorted.txt', 'w') as f:
    for item in duplicatesSorted:
        f.write("%s\n" % item)
The input is:
A Course of Pure Mathematics by G. H. Hardy
Agile Software Development, Principles, Patterns, and Practices by Robert C. Martin
Advanced Programming in the UNIX Environment, 3rd Edition
Advanced Selling Strategies: Brian Tracy
Advanced Programming in the UNIX(R) Environment
Alex's Adventures in Numberland: Dispatches from the Wonderful World of Mathematics by Alex Bellos, Andy Riley
Advertising Secrets of the Written Word: The Ultimate Resource on How to Write Powerful Advertising
Agile Software Development, Principles, Patterns, and Practices
A Course of Pure Mathematics (Cambridge Mathematical Library) 10th Edition by G. H. Hardy
Alex’s Adventures in Numberland
Advertising Secrets of the Written Word
Alex's Adventures in Numberland Paperback by Alex Bellos
The output is:
A Course of Pure Mathematics by G. H. Hardy
A Course of Pure Mathematics (Cambridge Mathematical Library) 10th Edition by G. H. Hardy
Agile Software Development, Principles, Patterns, and Practices by Robert C. Martin
Agile Software Development, Principles, Patterns, and Practices
Advanced Programming in the UNIX Environment, 3rd Edition
Advanced Programming in the UNIX(R) Environment
Alex's Adventures in Numberland: Dispatches from the Wonderful World of Mathematics by Alex Bellos, Andy Riley
Alex’s Adventures in Numberland
Alex's Adventures in Numberland Paperback by Alex Bellos
Advertising Secrets of the Written Word: The Ultimate Resource on How to Write Powerful Advertising
Advertising Secrets of the Written Word
Looking at the script, it seems to me that I have overcomplicated things. Please give me some suggestions on how I can make this code better.
2 Answers
Ok, I tried to rearrange it a bit:
- Corrected the stop words (they should only contain lowercase words)
- Used the Jaccard metric to calculate distances
- Rearranged the code structure
- Rewrote it in Python 3 with type annotations
You should now add an argument parser and that's basically it (a minimal sketch follows the code below). As far as I understood the task, the final goal is to remove duplicate books. Now you can play with the threshold argument to find out which strings should be treated as the same book.
import re
from typing import List, Callable, Set

from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams


def canonize(data: str) -> str:
    data = re.sub(r'[^\w\s]', ' ', data)  # replace punctuation with space
    data = re.sub(' +', ' ', data)  # replace multiple space with one
    return data.lower().strip()


def jaccard(book_a: str, book_b: str, n: int = 3) -> float:
    return 1 - jaccard_distance(set(ngrams(book_a, n)), set(ngrams(book_b, n)))


def filter_books(books: List[str],
                 book_filter_fun: Callable,
                 cmp_filter_func: Callable,
                 threshold: float = 0.3) -> Set[int]:
    excluded_indices = set()
    for one_book_offset, one_book in enumerate(books):
        if book_filter_fun(one_book):
            excluded_indices.add(one_book_offset)
        for another_book_offset, another_book in enumerate(books[one_book_offset + 1:], start=one_book_offset + 1):
            if {one_book_offset, another_book_offset} & excluded_indices:
                continue
            if cmp_filter_func(one_book, another_book) > threshold:
                excluded_indices.add(one_book_offset)
    return excluded_indices


if __name__ == '__main__':
    stopWords = {'i', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for', 'from', 'how', 'in', 'is', 'it',
                 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was'}
    with open('UnifiedBookList.txt') as fin:
        original_books = fin.readlines()
    canonized_books = list(map(canonize, original_books))
    excluded_indices = filter_books(
        canonized_books,
        lambda book: len(book.split()) < 2,  # a book name should contain at least 2 words
        jaccard,
    )
    with open('DuplicatesSorted.txt', 'w') as fout:
        for i, book in enumerate(original_books):
            if i in excluded_indices:
                continue
            fout.write(book)
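A minimal sketch of the argument parser mentioned above (the flag names `--input`, `--output`, and `--threshold` are placeholders of my own, not part of the answer's code); `filter_books` already takes `threshold` as a parameter, so the parsed value can be passed straight through:

import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description='Remove near-duplicate book titles.')
    parser.add_argument('--input', default='UnifiedBookList.txt',
                        help='file with one book title per line')
    parser.add_argument('--output', default='DuplicatesSorted.txt',
                        help='file to write the de-duplicated titles to')
    parser.add_argument('--threshold', type=float, default=0.3,
                        help='Jaccard similarity above which two titles count as the same book')
    return parser.parse_args()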
Don't know if it got any easier XD – kupihleba, Sep 16, 2020 at 19:30
far better than what I have done. – Ahmad Ismail, Sep 16, 2020 at 21:25
From the code, it looks like the criterion for saying two books match is that they share at least one n-gram. Given that, the code can be simplified quite a bit.
Basically, build a data structure as the book data is read in line by line. Each entry has the book name and its set of n-grams.
Then look for intersecting n-grams. Keep track of items that have already been matched up, so they aren't processed again.
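To make that criterion concrete, here is a small standalone sketch (not part of the answer's code) showing the word trigrams for two of the sample titles after lowercasing and stop-word removal; they share one trigram, so they would be grouped together:

from nltk.util import ngrams

# Tokens for "Advanced Programming in the UNIX Environment, 3rd Edition"
# and "Advanced Programming in the UNIX(R) Environment", stop words removed.
a = ('advanced', 'programming', 'unix', 'environment', '3rd', 'edition')
b = ('advanced', 'programming', 'unix', 'r', 'environment')

shared = set(ngrams(a, 3)) & set(ngrams(b, 3))
print(shared)  # {('advanced', 'programming', 'unix')}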
import io
import re

from nltk.util import ngrams

# `data` is the list of book titles shown above; `stopwords` is a set of
# lowercase stop words like the one in the question.

NAME = 0
NGRAM = 1
NGRAMSIZE = 3

book_data = []

with io.StringIO('\n'.join(data)) as fin:
    for line in fin:
        line = line.strip()
        words = re.findall(r'\w+', line.lower())
        good_words = tuple(w for w in words if w not in stopwords)
        n_grams = set(ngrams(good_words, NGRAMSIZE))
        book_data.append((line, n_grams))

used_indices = set()
grouped_books = []

for index, (_, book_ngrams) in enumerate(book_data):
    if index in used_indices:
        continue
    grouped_books.append(index)
    used_indices.add(index)
    for other_index, (_, other_ngrams) in enumerate(book_data[index + 1:], index + 1):
        if book_ngrams & other_ngrams:
            grouped_books.append(other_index)
            used_indices.add(other_index)

for index in grouped_books:
    print(f"{index:2} {book_data[index][NAME]}")
You could also consider using difflib from the standard library. Here's some code to show how it might be used.
import difflib as dl
import re

# `stopwords` is assumed to be the same lowercase stop-word set as above.

def isjunk(word):
    return word.lower() in stopwords  # treat stop words as junk so they don't drive the score

matcher = dl.SequenceMatcher(isjunk=isjunk)

with open('datafile.txt') as f:
    books = [line.strip().lower() for line in f]

titles = [re.findall(r'\w+', book) for book in books]

for i, seq2 in enumerate(titles):
    print('\n', i, books[i], '\n')
    matcher.set_seq2(seq2)
    for j, seq1 in enumerate(titles[i + 1:], i + 1):
        matcher.set_seq1(seq1)
        score = matcher.ratio()
        if score > 0.4:
            print(f"    {j:2} {score:4.2f} {books[j]}")