As part of a larger project to generate a website with a personal list of publications, I implemented a parser for BibTeX files.
The entry point is the parseFile
method in PublicationListParser
. This method scans the file for entries (and tags - a custom extension specific to my project). Each entry is read into a String
by Tokenizer
, and then parsed by BibItemParser
.
I'm looking for all kinds of feedback: style issues, missed corner cases, performance, logical organization, etc.
PublicationListParser.java
package publy.io.bibtexparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import publy.Console;
import publy.data.Author;
import publy.data.bibitem.BibItem;
public class PublicationListParser {
public static List<BibItem> parseFile(Path file) throws IOException, ParseException {
Console.debug("Parsing publication list \"%s\"", file);
PublicationListParser parser = new PublicationListParser();
parser.parseFileInternal(file);
AbbreviationHandler.handleAbbreviationsAndAuthors(parser.items, parser.abbreviations, parser.authors);
return parser.items;
}
private final List<BibItem> items = new ArrayList<>();
private final Map<String, String> abbreviations = new HashMap<>();
private final Map<String, Author> authors = new HashMap<>();
private PublicationListParser() {
}
private void parseFileInternal(Path file) throws IOException, ParseException {
try (BufferedReader in = Files.newBufferedReader(file, Charset.forName("UTF-8"))) {
for (String l = in.readLine(); l != null; l = in.readLine()) {
String line = l.trim();
if (line.startsWith("@")) {
// A Bibitem
BibItem item = BibItemParser.parseBibItem(Tokenizer.collectBibItem(in, line).replaceAll("\\s+", " "));
if (item != null) {
switch (item.getType()) {
case COMMENT:
case PREAMBLE:
break; // Ignore
case STRING:
// Add to abbreviations
abbreviations.put(item.get("short"), item.get("full"));
break;
default:
items.add(item);
}
}
} else if (line.startsWith("<")) {
// A custom tag
Tag tag = TagParser.parseTag(Tokenizer.collectTag(in, line).replaceAll("\\s+", " "));
if (tag.type == Tag.Type.ABBREVIATION) {
abbreviations.put(tag.values.get("short"), tag.values.get("full"));
} else if (tag.type == Tag.Type.AUTHOR) {
authors.put(tag.values.get("short"), tag.toAuthor());
} else {
throw new InternalError("Tag with unexpected type: " + tag);
}
}
}
}
}
}
Tokenizer.java
package publy.io.bibtexparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import publy.data.Pair;
public class Tokenizer {
public static String collectBibItem(BufferedReader input, String firstLine) throws IOException, ParseException {
CombinedReader in = new CombinedReader(input, firstLine);
StringBuilder bibitem = new StringBuilder();
// Test for starting with '@'
int c = in.read();
if ((char) c != '@') {
throw new ParseException("First character of bibitem should be '@'.");
}
// Scan for first open brace ('{')
bibitem.appendCodePoint(c);
c = in.read();
while (c != -1 && (char) c != '{') {
bibitem.appendCodePoint(c);
c = in.read();
}
if (c == -1) {
throw new ParseException("No opening brace found when trying to parse bibitem.");
} else {
bibitem.appendCodePoint(c);
}
// Collect the body
collectMatchedToken(in, '{', '}', bibitem);
return bibitem.toString();
}
public static String collectTag(BufferedReader input, String firstLine) throws IOException, ParseException {
CombinedReader in = new CombinedReader(input, firstLine);
StringBuilder tag = new StringBuilder();
// Test for starting with '<'
int c = in.read();
if ((char) c != '<') {
throw new IOException("First character of tag should be '<'.");
}
tag.appendCodePoint(c);
// Collect the body
collectMatchedToken(in, '<', '>', tag);
return tag.toString();
}
public static Pair<String, String> collectValue(String body) throws ParseException {
// Collect until first "level-0" comma or close brace (end of bibitem)
// When encountering an open brace, collect until we've matched it
// When encountering a quote ("), collect until next quote
int braceLevel = 0;
boolean inQuotes = false;
for (int i = 0; i < body.length(); i++) {
int c = body.codePointAt(i);
// Check braces
if ((char) c == '{') {
braceLevel++;
} else if (braceLevel > 0 && (char) c == '}') {
braceLevel--;
} else if (braceLevel == 0) {
// Check quotes
if ((char) c == '"') {
inQuotes = !inQuotes;
} else if (!inQuotes) {
if ((char) c == ',' || (char) c == '}') {
// zero-level end-of-value: we're done!
return new Pair<>(body.substring(0, i), body.substring(i));
}
}
}
}
throw new ParseException(String.format("End of input reached while collecting value.%nText: %s", body));
}
/**
* Collects characters from the input stream until the first time the number
* of close characters seen is larger than the number of open characters.
*
* @param in
* @param open
* @param close
* @return
*/
private static void collectMatchedToken(CombinedReader in, char open, char close, StringBuilder result) throws ParseException, IOException {
int openCount = 1;
while (openCount > 0) {
int c = in.read();
if (c == -1) {
if (open == '{') {
throw new ParseException("End of input reached while trying to match braces in bibitem body.");
} else if (open == '<') {
throw new ParseException("End of input reached while trying to match angle brackets in tag body.");
} else {
throw new ParseException("End of input reached while trying to match.");
}
}
result.appendCodePoint(c);
if ((char) c == open) {
openCount++;
} else if ((char) c == close) {
openCount--;
}
}
}
private static class CombinedReader {
boolean endOfString = false;
StringReader sr;
BufferedReader br;
public CombinedReader(BufferedReader br, String s) {
this.sr = new StringReader(s);
this.br = br;
}
public int read() throws IOException {
if (endOfString) {
return br.read();
} else {
int c = sr.read();
if (c == -1) {
endOfString = true;
return br.read();
} else {
return c;
}
}
}
}
private Tokenizer() {
}
}
BibItemParser.java
package publy.io.bibtexparser;
import java.io.IOException;
import publy.Console;
import publy.data.Pair;
import publy.data.bibitem.BibItem;
public class BibItemParser {
public static BibItem parseBibItem(String text) throws IOException, ParseException {
int bodyStart = text.indexOf('{');
String type = text.substring(1, bodyStart).trim().toLowerCase();
String body = text.substring(bodyStart + 1).trim();
switch (type) {
case "comment":
case "preamble":
return new BibItem(type, null); // Ignore contents
case "string":
return parseString(body);
default:
return parsePublication(type, body);
}
}
private static BibItem parseString(String body) {
// Syntax: Short = "Full" or Short = {Full}
int split = body.indexOf('=');
String shortName = body.substring(0, split).trim();
String fullText = body.substring(split + 1, body.length() - 1).trim(); // Remove outer '}'
fullText = fullText.substring(1, fullText.length() - 1); // Remove outer pair of braces or quotation marks
BibItem result = new BibItem("string", null);
result.put("short", shortName);
result.put("full", fullText);
return result;
}
private static BibItem parsePublication(String type, String body) throws ParseException {
// Syntax: id, (field-value-pair)*
int idEnd = body.indexOf(',');
if (idEnd == -1) {
// No fields
return new BibItem(type, body.substring(0, body.length() - 1));
}
String id = body.substring(0, idEnd).trim();
body = body.substring(idEnd + 1).trim();
BibItem result = new BibItem(type, id);
while (!body.isEmpty() && !body.equals("}")) {
// Parse the next field-value pair
int valueStart = body.indexOf('=');
if (valueStart == -1) {
// No more field-value pairs, but text left: warn
System.err.printf("After parsing all fields of publication \"%s\", the following text was left and not part of any field:\n%s\n", id, body);
Console.warn(Console.WarningType.OTHER, "After parsing all fields of publication \"%s\", the following text was left and not part of any field:\n%s\n", id, body);
break;
}
String field = body.substring(0, valueStart).trim().toLowerCase();
body = body.substring(valueStart + 1).trim();
Pair<String, String> value = Tokenizer.collectValue(body);
result.put(field, parseValue(value.getFirst()));
body = value.getSecond().trim();
if (body.startsWith(",")) {
body = body.substring(1).trim();
}
}
return result;
}
public static String parseValue(String text) {
// Drop outer pair of separators (braces or quotes)
// Turn @string abbreviations into publy abbreviations ("<<short>>")
// Process string concatenation
StringBuilder result = new StringBuilder();
int braceLevel = 0;
boolean inQuotes = false;
boolean inAbbreviation = false;
for (int i = 0; i < text.length(); i++) {
int c = text.codePointAt(i);
if (braceLevel > 0) {
if ((char) c == '{') {
braceLevel++;
} else if ((char) c == '}') {
braceLevel--;
}
if (braceLevel > 0 || inQuotes) {
// Add everything but the closing brace or quote
result.appendCodePoint(c);
}
} else if (inQuotes) {
if ((char) c == '"') {
inQuotes = false;
} else {
result.appendCodePoint(c);
if ((char) c == '{') {
braceLevel++;
} else if (braceLevel > 0 && (char) c == '}') {
braceLevel--;
}
}
} else if (inAbbreviation) {
if (Character.isWhitespace(c) || (char) c == '#' || (char) c == '{' || (char) c == '"') {
// End of abbreviation
result.append(">>");
inAbbreviation = false;
if ((char) c == '{') {
braceLevel = 1;
} else if ((char) c == '"') {
inQuotes = true;
}
} else {
result.appendCodePoint(c);
}
} else {
// Brace or quote start new tokens, pound is ignored, numbers just get parsed, text starts a new abbreviation token
if ((char) c == '{') {
braceLevel = 1;
} else if ((char) c == '"') {
inQuotes = true;
} else if (Character.isDigit(c)) {
result.appendCodePoint(c);
} else if (Character.isAlphabetic(c)) {
result.append("<<");
result.appendCodePoint(c);
inAbbreviation = true;
} // else ignore
}
}
if (inAbbreviation) {
result.append(">>");
}
return result.toString();
}
private BibItemParser() {
}
}
Here is a link to the full source code for the project on BitBucket.
2 Answers 2
.substring()
Note that .substring() values are not shared any more with the string
they were created from (link).
That means calls like body = body.substring(...).trim()
will allocate two new strings: one for the .substring(...)
and another
for the .trim()
which is another form of a sub-string.
So if you are concerned about efficiency, avoid marching through a string by repeatedly taking substrings.
use a library
I undersand this is Code Review, but since these seems to be a real project, consider using a third party library like jbibtex. jbibtex is distributed under a liberal license and looks like a very robust BiBTeX fle parser.
use a tokenizer
You use several different methods for consuming input:
- reading a file line by line -
in.readline()
- reading it one character at a time -
in.read()
- indexing into a string, e.g.
body.codePointAt(i)
- searching through a string. e.g.
body.indexOf(',')
followed bybody = body.substring(...)
.
I think you could make your life easier by tokenizing the
input and parse the file at the token level.
This would, for instance, get rid of the multiple .trim()
calls
all over the place since ignoring white space would be
the job of the tokenizer and thus handled in one place.
An example of a tokenizer is the class java.io.StreamTokenizer
.
It has two main methods:
.nextToken()
.pushBack()
.pushBack()
pushes the last token returned by .nextToken()
back
onto the stream so that it will be returned again by the
next .nextToken()
.
A tokenizer makes it easy to wmanually write a recursive descent parser for simple grammars like a BibTeX file.
Here is a blog post on writing a recursive descent parser with StreamTokenizer to parse boolean expressions:
https://unnikked.ga/how-to-evaluate-a-boolean-expression/
Update
Here is an example of how to roll your own parser - written in Python:
-
\$\begingroup\$ Thanks! These are all excellent suggestions. I had looked for existing parsers before writing this, but clearly not enough. I'll take a look at jbibtex and see if I can plug it in. \$\endgroup\$Mangara– Mangara2015年09月30日 11:35:53 +00:00Commented Sep 30, 2015 at 11:35
-
\$\begingroup\$ In case you still need to write your own parser, this page seems like it would be very helpful in understanding what the
bibtex
program accepts as valid input: A summary of BibTeX \$\endgroup\$ErikR– ErikR2015年09月30日 12:37:35 +00:00Commented Sep 30, 2015 at 12:37 -
\$\begingroup\$ You're right, that's a great resource. I already found one bug in my current implementation: it doesn't accept entries contained in pairs of parentheses. \$\endgroup\$Mangara– Mangara2015年09月30日 16:12:55 +00:00Commented Sep 30, 2015 at 16:12
-
\$\begingroup\$ Answer updated. I wrote some Python code which demonstrates one way to roll your own parser from scratch. Should be easily transferrable to Java. \$\endgroup\$ErikR– ErikR2015年09月30日 20:28:55 +00:00Commented Sep 30, 2015 at 20:28
You have very large if
code blocks that I can't help but wonder if it will be better to extract them as methods.
For CombinedReader
, maybe you can consider setting the StringReader
to null
when there is no more to be read from it:
private static class CombinedReader {
private StringReader stringReader;
// this you can make final
private final BufferedReader bufferedReader;
// note: reordered arguments so that it reads like
// the String is 'used' first, followed by the BufferedReader
public CombinedReader(String s, BufferedReader br) {
this.stringReader = new StringReader(s);
this.bufferedReader = br;
}
public int read() throws IOException {
if (stringReader != null) {
int c = stringReader.read();
if (c != -1) {
return c;
}
stringReader = null;
}
return bufferedReader.read();
}
}
edit: An example of extracting if
code blocks as methods for PublicationListParser
:
private void parseFileInternal(Path file) throws IOException, ParseException {
try (BufferedReader in = Files.newBufferedReader(file, Charset.forName("UTF-8"))) {
for (String l = in.readLine(); l != null; l = in.readLine()) {
String line = l.trim();
if (line.startsWith("@")) {
handleBibItem(BibItemParser.parseBibItem(
normalize(Tokenizer.collectBibItem(in, line))));
} else if (line.startsWith("<")) {
handleTag(TagParser.parseTag(
normalize(Tokenizer.collectTag(in, line))));
}
}
}
}
private static String normalize(String input) {
return input.replaceAll("\\s+", " ");
}
private void handleBibItem(BibItem item) {
if (item == null) {
return;
}
switch (item.getType()) {
case COMMENT:
case PREAMBLE:
break; // Ignore
case STRING:
// Add to abbreviations
abbreviations.put(item.get("short"), item.get("full"));
break;
default:
items.add(item);
}
}
private void handleTag(Tag tag) {
switch (tag.type) {
case Tag.Type.ABBREVIATION:
abbreviations.put(tag.values.get("short"), tag.values.get("full"));
break;
case Tag.Type.AUTHOR:
authors.put(tag.values.get("short"), tag.toAuthor());
break;
default:
throw new InternalError("Tag with unexpected type: " + tag);
}
}
-
\$\begingroup\$ Could you give a concrete example of one of my
if
blocks that would be better as a separate method? \$\endgroup\$Mangara– Mangara2015年09月30日 11:08:16 +00:00Commented Sep 30, 2015 at 11:08 -
\$\begingroup\$ @Mangara updated my answer with an example. \$\endgroup\$h.j.k.– h.j.k.2015年09月30日 15:27:12 +00:00Commented Sep 30, 2015 at 15:27
-
\$\begingroup\$ Thanks! And you're right, that's much clearer. I'll see if I can do the same thing elsewhere. \$\endgroup\$Mangara– Mangara2015年09月30日 16:15:11 +00:00Commented Sep 30, 2015 at 16:15
Path
argument. It would be much more flexible if it accepted anInputStream
. You can always add a convenience overload that opens and closes the file for the client and then delegates to the general method if you consider it helpful. \$\endgroup\$