I have this code to create an inverted index from a directory of text files:
(def p (re-pattern #"[^\p{L}&&[^\p{M}]]"))
(defn invert[file]
(let [f (.getName file)
tokens (.split p (lower-case (slurp file)))]
(into {} (mapcat #(hash-map % #{f}) tokens))))
(defn build-index[dirname]
(reduce #(merge-with union %1 %2) (map invert (.listFiles (java.io.File. dirname)))))
Can you improve it / make it more idiomatic? My concern is performance. I'm testing it with 10k files of 10k in size, and I can make it about 20-30% faster if I use transients like this:
(defn add![idx file]
(let [f (.getName file)]
(loop [idx idx
tokens (.split p (lower-case (slurp file)))]
(if-not (seq tokens) idx
(recur (assoc! idx (first tokens) (union (idx (first tokens)) #{f})) (rest tokens))))))
(defn build-index[dirname]
(loop [files (.listFiles (java.io.File. dirname))
idx (transient {})]
(if-not (seq files) (persistent! idx)
(recur (rest files) (add! idx (first files))))))
Full code including test file generator here:
https://github.com/dbasch/closearch
Any feedback is welcome.
1 Answer 1
You should be able to use reducers to get a good speed boost as the consumption and building of your index can be parallelized.
I have found the best docs for reducers to be the doc strings themselves: Reducers -- Github
(ns ...
(:require [clojure.core.reducers :as r]
[clojure.java.io :as io]
[clojure.string :as str]))
(defn invert [file]
(let [f (.getName file)
tokens (str/split (str/lower-case (slurp file)) p)]
(->> tokens
(r/mapcat #(hash-map % #{f}))
(into {}))))
(defn build-index
[dirname]
(->> dirname
io/file
(.listFiles)
(r/map invert)
(r/fold (r/monoid (partial merge-with union) hash-map))))