Count the word frequencies within a given directory

Question 1

I have built a tiny script to count the word frequencies of all text files within a specified directory. I would be glad to receive any suggestions and feedback.

import re, glob,os
from collections import OrderedDict, Counter
class CountWords:
    """Count word frequencies over every ``*.<ext>`` file below a directory.

    A "word" is a maximal run of ASCII letters and underscores; words are
    lower-cased before counting.

    Parameters:
        dir -- directory that is searched recursively
        ext -- file extension without the leading dot, e.g. ``"txt"``
    """

    def __init__(self, dir, ext):
        # ``dir`` shadows the builtin of the same name, but the parameter name
        # is part of the public interface, so it is kept unchanged.
        self.dir = dir
        self.ext = ext

    @property
    def files(self):
        """Return the matching file paths, relative to ``self.dir``, sorted.

        The search pattern is anchored at ``self.dir`` instead of calling
        ``os.chdir``: changing the working directory is a process-wide side
        effect, and with a relative ``dir`` a second access would try to
        ``chdir`` into ``dir/dir`` and fail.
        """
        root = os.fspath(self.dir)
        # Escape the directory so glob metacharacters in its name stay literal.
        pattern = os.path.join(glob.escape(root), "**", f"*.{self.ext}")
        matches = glob.glob(pattern, recursive=True)
        # Sorting makes the result independent of filesystem listing order.
        return sorted(os.path.relpath(path, root) for path in matches)

    @property
    def contents(self):
        """Return the text of all matching files joined with commas."""
        texts = []
        for name in self.files:
            with open(os.path.join(self.dir, name), 'r') as f:
                texts.append(f.read())
        # The comma keeps the last word of one file from fusing with the
        # first word of the next; it is never part of a word itself.
        return ",".join(texts)

    @property
    def words(self):
        """Return every word found in ``contents``, lower-cased, in order."""
        # ``A-Z`` (not ``A-z``): the latter range also matches the ASCII
        # punctuation between 'Z' and 'a' ("[", "\", "]", "^", "`").
        return [word.lower() for word in re.findall(r"[a-zA-Z_]+", self.contents)]

    def count(self):
        """Return an OrderedDict of word -> count, most frequent first."""
        return OrderedDict(Counter(self.words).most_common())

The function version:

def count_words(dir, ext):
    """Count word frequencies in all ``*.<ext>`` files below ``dir``.

    A "word" is a maximal run of ASCII letters and underscores; words are
    lower-cased before counting.

    Parameters:
        dir -- directory that is searched recursively (the name shadows the
               builtin but is kept because it is part of the interface)
        ext -- file extension without the leading dot, e.g. ``"txt"``

    Returns an OrderedDict of word -> count, most frequent first.
    """
    root = os.fspath(dir)
    # Anchor the pattern at ``dir`` rather than calling os.chdir(), which
    # would change the working directory of the whole process. ``{ext}`` must
    # be interpolated; a literal "*.ext" only matches files ending in ".ext".
    pattern = os.path.join(glob.escape(root), "**", f"*.{ext}")
    word_counter = Counter()
    # Sorted so the order of equally frequent words does not depend on the
    # filesystem listing order.
    for path in sorted(glob.glob(pattern, recursive=True)):
        with open(path, 'r') as f:
            # Count file by file instead of building one huge string.
            # ``A-Z`` (not ``A-z``) so "[", "\", "]", "^" and "`" are
            # treated as separators rather than as letters.
            word_counter.update(
                word.lower() for word in re.findall(r"[a-zA-Z_]+", f.read())
            )
    return OrderedDict(word_counter.most_common())

Question 2

#!/usr/bin/python3
import os
import re
from os.path import join
from collections import Counter, OrderedDict
def count_words(directory, ext):
    """Tally how often each word occurs in the files below ``directory``.

    Keyword arguments:
    directory -- root of the tree that is walked recursively
    ext -- only files whose name ends with this string are read

    A word is a run of the characters a-z and underscore, taken from the
    lower-cased file text. A file whose content cannot be decoded is
    reported on stdout and skipped.

    Returns an OrderedDict ordered from most to least frequent word.
    """
    tally = Counter()
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith(ext):
                continue
            path = join(folder, name)
            with open(path, 'r') as handle:
                try:
                    text = handle.read()
                except UnicodeDecodeError:
                    # Warn and move on instead of aborting the whole count.
                    print("Warning: couldn't decode", path)
                    continue
                # Lower-casing first keeps the pattern simple, and each
                # file's text is dropped as soon as it has been scanned.
                tally.update(re.findall('[a-z_]+', text.lower()))
    # most_common() already yields (word, count) pairs in descending order.
    return OrderedDict(tally.most_common())

Thomas Nelson Thomas Nelson 1668 bronze badges · Accepted Answer · 2018-05-31 18:53:27Z

#!/usr/bin/python3
import os
import re
from os.path import join
from collections import Counter, OrderedDict
def count_words(directory, ext):
    """Return word frequencies for the matching files under ``directory``.

    Keyword arguments:
    directory -- searched recursively with os.walk
    ext -- a file is included when its name ends with this string

    Words are runs of a-z and underscore in the lower-cased text.
    Undecodable files produce a warning on stdout and contribute nothing.

    Returns an OrderedDict, from most to least frequent.
    """
    def words_in(path):
        '''Return the words of one file, or [] if it cannot be decoded.'''
        with open(path, 'r') as stream:
            try:
                return re.findall('[a-z_]+', stream.read().lower())
            except UnicodeDecodeError:
                print("Warning: couldn't decode", path)
                return []

    # Lazily produce the path of every file with the wanted ending, so each
    # file is read as soon as the walk reaches it.
    matching = (
        join(root, fname)
        for root, _dirs, files in os.walk(directory)
        for fname in files
        if fname.endswith(ext)
    )

    totals = Counter()
    for path in matching:
        # Feed the counter one file at a time; no giant combined string.
        totals.update(words_in(path))
    return OrderedDict(totals.most_common())

Stack Exchange Network

Count the word frequencies within a given directory

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Count the word frequencies within a given directory

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions