\$\begingroup\$
\$\endgroup\$
I have built a tiny script count the word frequencies of all text files with a specified directory. I would be glad to receive any suggestions and feedback.
import re, glob,os
from collections import OrderedDict, Counter
class CountWords:
def __init__(self,dir,ext):
self.dir = dir
self.ext = ext
@property
def files(self):
#collect all the files with `.txt` extension
os.chdir(self.dir)
return glob.glob(f"**/*.{self.ext}", recursive=True)
@property
def contents(self):
#concatenate files
cat_content = []
for file in self.files:
with open(file, 'r') as f:
content = f.read()
cat_content.append(content)
contents = ",".join(cat_content)
return contents
@property
def words(self):
words = re.findall(r"[a-zA-z_]+", self.contents)
words = [word.lower() for word in words]
return words
def count(self):
words_counter = Counter(self.words)
words_counter = OrderedDict(sorted(words_counter.items(),key=lambda t:t[1], reverse=True))
return words_counter
The function version:
def count_words(dir,ext):
os.chdir(dir)
#collect all the files with `.txt` extension
files = glob.glob(f"**/*.ext", recursive=True)
#concatenate files
cat_content = []
for file in files:
with open(file, 'r') as f:
content = f.read()
cat_content.append(content)
contents = ",".join(cat_content)
#extract the words
words = re.findall(r"[a-zA-z_]+", contents)
words = [word.lower() for word in words]
words_counter = Counter(words)
words_counter = OrderedDict(sorted(words_counter.items(),key=lambda t:t[1], reverse=True))
return words_counter
Jamal
35.2k13 gold badges134 silver badges238 bronze badges
1 Answer 1
\$\begingroup\$
\$\endgroup\$
#!/usr/bin/python3
import os
import re
from os.path import join
from collections import Counter, OrderedDict
def count_words(directory, ext): # don't use the name dir, it's a builtin function
"""Counts word frequencies in a directory of files.
Keyword arguments:
directory -- count_words will search this directory recursively
ext -- the extension of files that you wish to count
Returns an OrderedDict, from most to least frequent.
(You don't strictly need a docstring, but it's a good habit to get into.
See PEP 257.)"""
# Initialize the counter
word_counter = Counter()
# Personally I like to break my code into small, simple functions
# This code could be inline in the loop below,
# but I think it's a bit clearer this way.
def update_counter(word_counter, filename):
'''updates word_counter with all the words in open(filename)'''
with open(filename, 'r') as f:
try:
# Two things:
# First, I'm updating counter here,
# instead of creating an enormous string.
# Each file string only exists long enough for me to search it.
# Second and less important,
# I chose to lower the full text first, to make the regex simpler.
word_counter.update(re.findall('[a-z_]+', f.read().lower()))
except UnicodeDecodeError:
# In python3 if your file isn't UTF-8
# then read() throws an exception.
# Maybe you want this behavior,
# but I'd rather just warn and continue.
print("Warning: couldn't decode", filename)
# Using os.walk instead of glob
for root, dirs, files in os.walk(directory):
for fname in files:
if fname.endswith(ext):
update_counter(word_counter, join(root, fname))
# words_counter.most_common() does exactly the sort you are looking for
return OrderedDict(word_counter.most_common())
answered May 31, 2018 at 18:53
lang-py