1
\$\begingroup\$

There are two versions of the CSV parser. The latter seems to be more performant than the former, although the former's code is easier to read.

Variant 1, which parses a string:

(defn newline?
  "Returns true when the character sequence begins with a line terminator:
  either a bare line feed, or a carriage return immediately followed by a
  line feed. Only the first two elements of the argument are inspected."
  [[a b]]
  (case a
    \newline true
    ;; A carriage return only counts when it is part of a CRLF pair.
    \return (= b \newline)
    false))
(defn parse-value-raw
  "Parses one unquoted CSV field from the character sequence `s`.

  Returns `[value remaining]`, where `value` is the text read so far and
  `remaining` is the unconsumed input, starting at the comma or line break
  that ended the field (or empty/nil when the input ran out).

  Reaching the end of the input returns the accumulated text, so the final
  field of data that lacks a trailing newline is preserved."
  [s]
  (loop [[a & more :as all] s
         res ""]
    (if (or (empty? all)        ; end of input terminates the field too
            (= a \,)
            (newline? all))
      [res all]
      (recur more (str res a)))))
(defn parse-value-quoted
  "Parses one quoted CSV field. The first element of the argument is the
  opening quote character; the field runs until the matching closing quote.
  A doubled double-quote inside the field stands for one literal quote.

  Returns `[value remaining]`, where `remaining` is whatever follows the
  closing quote. If the input ends before a closing quote is found, the
  text collected so far is returned with a nil remainder instead of
  looping forever."
  [[q & val]]
  (loop [[a & tail :as remaining] val
         res ""]
    (cond
      ;; Unterminated field: stop at end of input.
      (empty? remaining) [res nil]
      ;; Escaped quote: consume both characters, emit one.
      (and (= a \") (= (first tail) \")) (recur (rest tail) (str res \"))
      ;; Closing quote.
      (= a q) [res tail]
      :else (recur tail (str res a)))))
(defn parse-value
  "Parses a single CSV field from `s`, choosing the quoted parser when the
  input starts with a double quote and the raw parser otherwise. Returns
  whatever the chosen parser returns."
  [s]
  (let [parser (if (= \" (first s))
                 parse-value-quoted
                 parse-value-raw)]
    (parser s)))
(defn parse-record
  "Parses one CSV record (a line of comma-separated fields) from `s`.

  Returns `[fields remaining]`. `fields` is a vector of the field values.
  `remaining` is the unconsumed input, positioned at the line break that
  ended the record or empty at end of input. When a field is followed by
  something other than a comma, a line break or the end of input,
  `remaining` is the keyword `:error`."
  [s]
  (loop [fields []
         [v tail] (parse-value s)]
    (let [fields (conj fields v)]
      (cond
        ;; Comma: another field follows on the same line.
        (= (first tail) \,) (recur fields (parse-value (rest tail)))
        (or (newline? tail) (empty? tail)) [fields tail]
        :else [fields :error]))))
(defn parse-csv
  "Parses CSV text `s` into a vector of records, each a vector of strings.

  Records are separated by LF or CRLF. A line break at the very end of the
  input terminates the last record and does not start a new, empty one.
  Parsing stops at the first malformed record (signalled by `parse-record`
  with an `:error` remainder); the rows read so far, including the
  malformed one, are returned."
  [s]
  (loop [rows []
         [rec tail] (parse-record s)]
    (let [rows (conj rows rec)]
      (cond
        ;; Must be tested before `empty?`, which throws on a keyword.
        (= tail :error) rows
        (empty? tail) rows
        (newline? tail)
        ;; `newline?` succeeding on a leading CR means the pair is CRLF, so
        ;; both characters have to be skipped.
        (let [width (if (= (first tail) \return) 2 1)
              more (drop width tail)]
          (if (empty? more)
            rows
            (recur rows (parse-record more))))
        :else rows))))

Variant 2, which uses java.io.PushbackReader and StringBuilder:

(defn create-reader
  "Opens the file at `path` for reading and returns a buffered
  java.io.PushbackReader with a pushback capacity of two characters.
  The caller is responsible for closing the returned reader."
  [^String path]
  (-> path
      (java.io.FileReader.)
      (java.io.BufferedReader.)
      (java.io.PushbackReader. 2)))
(defn line-break?
  "Returns true when character `c`, just read from `r`, ends a line.

  A line feed always does. A carriage return does only when the next
  character in `r` is a line feed, in which case that line feed is
  consumed. Otherwise the peeked character is pushed back and false is
  returned. A carriage return at the very end of the stream returns false
  without pushing anything back."
  [c ^java.io.PushbackReader r]
  (cond
    (= c \newline) true
    (= c \return) (let [nx (.read r)]
                    (cond
                      (= nx (int \newline)) true
                      ;; End of stream: -1 is not a character, so it must
                      ;; neither be converted with `char` nor unread.
                      (= nx -1) false
                      :else (do (.unread r nx) false)))
    :else false))
(defn read-next
  "Reads the next token from `r` and returns it as a vector:

    [:eof]               - the stream is exhausted
    [:escaped-quote \\\"]  - two consecutive double quotes were consumed
    [:char c]            - any other single character `c`, including a
                           lone double quote

  After a double quote the following character is peeked; if it is not a
  second quote it is pushed back. A double quote that is the last
  character of the stream is returned as [:char \\\"]."
  [^java.io.PushbackReader r]
  (let [cd (.read r)]
    (cond
      (= cd -1) [:eof]
      (= cd (int \")) (let [nx (.read r)]
                        (cond
                          (= nx (int \")) [:escaped-quote \"]
                          ;; End of stream after the quote: -1 cannot be
                          ;; turned into a char or pushed back.
                          (= nx -1) [:char \"]
                          :else (do (.unread r nx) [:char \"])))
      :else [:char (char cd)])))
(defn parse-value
  "Reads one CSV field from `r` and returns `[state value]`.

  `value` is the field text. `state` says what ended the field:

    :separator - a comma; another field follows in the same record
    :linebreak - a line break; the record is complete
    :eof       - the end of the stream
    :error     - a quoted field was not closed before the end of the
                 stream, or its closing quote was followed by something
                 other than a comma, a line break or the end of the stream

  A field is quoted when its first character is a double quote. Inside a
  quoted field two consecutive double quotes produce one literal quote.
  In an unquoted field, quotes are kept exactly as written."
  [^java.io.PushbackReader r]
  (let [sb (StringBuilder.)
        finish (fn [state] [state (.toString sb)])
        ;; The first character is read raw rather than through `read-next`,
        ;; so that a field starting with two quotes (e.g. the empty quoted
        ;; field) is not mistaken for an escaped quote.
        head (.read r)]
    (if (= head (int \"))
      ;; Quoted field. `closed` is true once the closing quote was seen.
      (loop [closed false]
        (let [[ctype c] (read-next r)]
          (cond
            closed (cond
                     (= ctype :eof) (finish :eof)
                     (= c \,) (finish :separator)
                     (line-break? c r) (finish :linebreak)
                     :else (finish :error))
            (= ctype :eof) (finish :error)
            (= ctype :escaped-quote) (do (.append sb c) (recur false))
            (= c \") (recur true)
            :else (do (.append sb c) (recur false)))))
      ;; Unquoted field. The first character is pushed back so that it goes
      ;; through the same checks as the rest; an empty field therefore ends
      ;; immediately on its comma or line break.
      (do
        (when-not (= head -1) (.unread r head))
        (loop []
          (let [[ctype c] (read-next r)]
            (cond
              (= ctype :eof) (finish :eof)
              (= ctype :escaped-quote) (do (.append sb "\"\"") (recur))
              (= c \,) (finish :separator)
              (line-break? c r) (finish :linebreak)
              :else (do (.append sb c) (recur)))))))))
(defn parse-csv
  "Reads CSV data from `r` until the end of the stream or the first
  malformed field, and returns a vector of records, each a vector of
  field strings.

  A line break that is the last thing in the stream terminates the final
  record; it does not start an additional empty record. When a field is
  reported as `:error`, the record read so far (including that field) is
  appended and parsing stops."
  [^java.io.PushbackReader r]
  (loop [records []]
    (let [[st rec] (loop [record []]
                     (let [[state v] (parse-value r)
                           record (conj record v)]
                       (if (contains? #{:error :eof :linebreak} state)
                         [state record]
                         (recur record))))]
      (cond
        (= st :error) (conj records rec)
        (= st :eof) (conj records rec)
        (= st :linebreak)
        ;; Peek one character: if the stream ends right after the line
        ;; break there is no further record to read.
        (let [rows (conj records rec)
              peeked (.read r)]
          (if (= peeked -1)
            rows
            (do (.unread r peeked)
                (recur rows))))))))

Please review both of them and tell me how to improve them.

toolic
15.8k6 gold badges29 silver badges217 bronze badges
asked Oct 14 at 11:27
\$\endgroup\$
1
  • \$\begingroup\$ "more performant" -- show us an example .csv workload you ran each one against, along with timings please. \$\endgroup\$ Commented Oct 15 at 15:46

0

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.