My project here works upon output that comes out of a Tesseract OCR scan using hOCR format, then I read it with JDOM 2.0 and finally save it one of my own objects, which at a later point needs to be serializable. I have spotted one major codesmell, which is a for-loop of 5 levels deep.
An example hOCR output file:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "D:\DPC2\converted60円60円.tiff"; bbox 0 0 2479 3508; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 1690 267 2165 394">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 1690 267 2165 394">
<span class='ocr_line' id='line_1' title="bbox 1690 267 2165 394"><span class='ocrx_word' id='word_1' title="bbox 1690 267 2165 394"> </span>
</span>
</p>
</div>
</div>
</body>
</html>
In reality there can be many pages, areas, paragraphs, lines and words. I have only showed one of each here for simplicity and because I cannot disclose full files, also note that the content of the word shown here is " ", a string of one space.
A minor defect is that there are still some TODO comments which I need to transform to actual loggers.
First the Traversable
structure:
/**
*
* @author Frank van Heeswijk
* @param <P> The type of the parent
* @param <C> The type of the children
*/
public interface Traversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> {
default public boolean hasParent() {
return (getParent() != null);
}
public P getParent();
default public boolean hasChildren() {
return (getChildren().count() > 0);
}
public Stream<C> getChildren();
public void setParent(final P parent);
public void addChild(final C child);
default public void addChildren(final Stream<? extends C> children) {
children.forEach(this::addChild);
}
public String getId();
public BoundingBox getBoundingBox();
abstract public static class Void implements Traversable<Void, Void> {
@Override
public Void getParent() {
return null;
}
@Override
public Stream<Void> getChildren() {
return Stream.empty();
}
@Override
public void setParent(final Void parent) { }
@Override
public void addChild(final Void child) { }
@Override
public String getId() {
return null;
}
@Override
public BoundingBox getBoundingBox() {
return null;
}
}
}
/**
*
* @author Frank van Heeswijk
* @param <P> The type of the parent
* @param <C> The type of the children
*/
abstract public class AbstractTraversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> implements Traversable<P, C> {
protected final Collection<C> children = new ArrayList<>();
protected P parent;
@Override
public P getParent() {
return parent;
}
@Override
public Stream<C> getChildren() {
return children.stream();
}
@Override
public void setParent(final P parent) {
this.parent = Objects.requireNonNull(parent);
}
@Override
public void addChild(final C child) {
this.children.add(Objects.requireNonNull(child));
}
}
/**
*
* @author Frank van Heeswijk
* @param <P> The type of the parent
* @param <C> The type of the children
*/
abstract public class FileElement<P extends FileElement<?, ?>, C extends FileElement<?, ?>> extends AbstractTraversable<P, C> {
protected final String id;
protected final BoundingBox boundingBox;
public FileElement(final String id, final BoundingBox boundingBox) {
this.id = Objects.requireNonNull(id);
this.boundingBox = Objects.requireNonNull(boundingBox);
}
@Override
public String getId() {
return id;
}
@Override
public BoundingBox getBoundingBox() {
return boundingBox;
}
public static <E extends FileElement<?, ?>> E ofBoundingBoxElement(final Element element, final BiFunction<String, BoundingBox, E> constructor) {
Objects.requireNonNull(element);
Objects.requireNonNull(constructor);
String elementId = element.getAttributeValue("id");
String elementTitle = element.getAttributeValue("title");
Title title = new Title(elementTitle);
return constructor.apply(
elementId,
title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle))
);
}
abstract public static class Void extends FileElement<Void, Void> {
public Void(String id, BoundingBox boundingBox) {
super(id, boundingBox);
}
}
}
public class BoundingBox {
private final int x1;
private final int y1;
private final int x2;
private final int y2;
public BoundingBox(final int x1, final int y1, final int x2, final int y2) {
this.x1 = x1;
this.y1 = y1;
this.x2 = x2;
this.y2 = y2;
}
public int getX1() {
return x1;
}
public int getY1() {
return y1;
}
public int getX2() {
return x2;
}
public int getY2() {
return y2;
}
@Override
public String toString() {
return BoundingBox.class.getSimpleName() + "(" + x1 + ", " + y1 + ", " + x2 + ", " + y2 + ")";
}
@Override
public int hashCode() {
int hash = 7;
hash = 89 * hash + this.x1;
hash = 89 * hash + this.y1;
hash = 89 * hash + this.x2;
hash = 89 * hash + this.y2;
return hash;
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final BoundingBox other = (BoundingBox) obj;
if (this.x1 != other.x1) {
return false;
}
if (this.y1 != other.y1) {
return false;
}
if (this.x2 != other.x2) {
return false;
}
if (this.y2 != other.y2) {
return false;
}
return true;
}
}
public class Title {
private static final Map<String, BiConsumer<Title, Stream<String>>> KEYWORD_CONSUMERS = new HashMap<>();
static {
KEYWORD_CONSUMERS.put("bbox", new BoundingBoxConsumer());
KEYWORD_CONSUMERS.put("image", new ImagePathConsumer());
KEYWORD_CONSUMERS.put("ppageno", new PageNumberConsumer());
}
private Optional<BoundingBox> boudingBox = Optional.empty();
private Optional<Path> imagePath = Optional.empty();
private OptionalInt pageNumber = OptionalInt.empty();
public Title(final String title) {
process(Objects.requireNonNull(title));
}
private void process(final String title) {
for (String part : splitTitle(title)) {
List<String> parts = Arrays.stream(part.split(" "))
.map(String::trim)
.filter(str -> !str.isEmpty())
.collect(Collectors.toList());
if (parts.size() <= 1) {
//TODO logger, ignoring because x
continue;
}
String keyword = parts.get(0);
if (!KEYWORD_CONSUMERS.containsKey(keyword)) {
//TODO logger, zz not found
continue;
}
KEYWORD_CONSUMERS.get(keyword).accept(this, parts.subList(1, parts.size()).stream());
}
}
private List<String> splitTitle(final String input) {
return Arrays.asList(input.split(";(?=([^\"]*\"[^\"]*\")*[^\"]*$)"));
}
private void setBoundingBox(final BoundingBox boundingBox) {
Objects.requireNonNull(boundingBox);
this.boudingBox = Optional.of(boundingBox);
}
private void setImagePath(final Path imagePath) {
Objects.requireNonNull(imagePath);
this.imagePath = Optional.of(imagePath);
}
private void setPageNumber(final int pageNumber) {
this.pageNumber = OptionalInt.of(pageNumber);
}
public Optional<BoundingBox> getBoundingBox() {
return boudingBox;
}
public Optional<Path> getImagePath() {
return imagePath;
}
public OptionalInt getPageNumber() {
return pageNumber;
}
@Override
public String toString() {
return Title.class.getSimpleName() + "(" + boudingBox + ", " + imagePath + ", " + pageNumber + ")";
}
private static class BoundingBoxConsumer implements BiConsumer<Title, Stream<String>> {
@Override
public void accept(final Title title, final Stream<String> stream) {
Objects.requireNonNull(title);
Objects.requireNonNull(stream);
List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
if (arguments.size() != 4) {
//TODO logger, ignoring because y
}
title.setBoundingBox(new BoundingBox(arguments.get(0), arguments.get(1), arguments.get(2), arguments.get(3)));
}
}
private static class ImagePathConsumer implements BiConsumer<Title, Stream<String>> {
@Override
public void accept(final Title title, final Stream<String> stream) {
Objects.requireNonNull(title);
Objects.requireNonNull(stream);
List<Path> arguments = stream.map(str -> str.replace("\"", "")).map(Paths::get).collect(Collectors.toList());
if (arguments.size() != 1) {
//TODO logger, ignoring because yy
}
title.setImagePath(arguments.get(0));
}
}
private static class PageNumberConsumer implements BiConsumer<Title, Stream<String>> {
@Override
public void accept(final Title title, final Stream<String> stream) {
Objects.requireNonNull(title);
Objects.requireNonNull(stream);
List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
if (arguments.size() != 1) {
//TODO logger, ignoring because yyy
}
title.setPageNumber(arguments.get(0));
}
}
}
public class Page extends FileElement<FileElement.Void, Area> {
private final Path imagePath;
private final int pageNumber;
public static Page of(final Element element) {
Objects.requireNonNull(element);
String elementId = element.getAttributeValue("id");
String elementTitle = element.getAttributeValue("title");
Title title = new Title(elementTitle);
return new Page(
elementId,
title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
title.getImagePath().orElseThrow(() -> new IllegalStateException("No image path present in: " + elementTitle)),
title.getPageNumber().orElseThrow(() -> new IllegalStateException("No page number present in: " + elementTitle))
);
}
public Page(final String id, final BoundingBox boundingBox, final Path imagePath, final int pageNumber) {
super(id, boundingBox);
this.imagePath = Objects.requireNonNull(imagePath);
this.pageNumber = Objects.requireNonNull(pageNumber);
}
@Override
public FileElement.Void getParent() {
return null;
}
@Override
public void setParent(final FileElement.Void parent) {
throw new IllegalStateException("The Page element has no parent.");
}
public Path getImagePath() {
return imagePath;
}
public int getPageNumber() {
return pageNumber;
}
}
public class Area extends FileElement<Page, Paragraph> {
public Area(final String id, final BoundingBox boundingBox) {
super(id, boundingBox);
}
public static final Area of(final Element element) {
return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Area::new);
}
}
public class Paragraph extends FileElement<Area, Line> {
public Paragraph(final String id, final BoundingBox boundingBox) {
super(id, boundingBox);
}
public static final Paragraph of(final Element element) {
return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Paragraph::new);
}
}
public class Line extends FileElement<Paragraph, Word> {
public Line(final String id, final BoundingBox boundingBox) {
super(id, boundingBox);
}
public static final Line of(final Element element) {
return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Line::new);
}
}
public class Word extends FileElement<Line, FileElement.Void> {
private final String content;
private final boolean strong;
public Word(final String id, final BoundingBox boundingBox, final String content, final boolean strong) {
super(id, boundingBox);
this.content = Objects.requireNonNull(content);
this.strong = strong;
}
public static final Word of(final Element element) {
Objects.requireNonNull(element);
String elementId = element.getAttributeValue("id");
String elementTitle = element.getAttributeValue("title");
Title title = new Title(elementTitle);
Element child = element.getChild("strong");
return new Word(
elementId,
title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
child == null ? element.getText() : child.getText(),
child != null
);
}
@Override
public Stream<Void> getChildren() {
return Stream.empty();
}
@Override
public void addChild(final Void child) {
throw new IllegalStateException("The Word element has no children.");
}
public String getContent() {
return content;
}
public boolean isStrong() {
return strong;
}
}
The usage:
Note that the ScannedFile
is not complete (but working) yet, I will put convienience methods in there once I need them.
public class ScannedFile implements Serializable {
private static final long serialVersionUID = 859948374926589L;
private final List<Page> pages = new ArrayList<>();
public void addChild(final Page page) {
pages.add(page);
}
public Stream<Page> getPages() {
return pages.stream();
}
}
The actual XML reading, which has some code smell in it:
public static ScannedFile createScannedFileFromHOCR(final Path directory) {
try {
Objects.requireNonNull(directory);
Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
SAXBuilder xmlBuilder = new SAXBuilder();
Document document = xmlBuilder.build(htmlFile.toFile());
ScannedFile scannedFile = new ScannedFile();
XPathFactory factory = XPathFactory.instance();
System.out.println("factory = " + factory);
XPathExpression<Element> xpePages = factory.compile( //arguments are untyped?
"/h:html/h:body/h:div[@class='ocr_page']",
Filters.element(),
null,
Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
);
System.out.println("xpePages = " + xpePages);
List<Element> pages = xpePages.evaluate(document);
System.out.println("pages = " + pages);
for (Element pageElement : pages) {
Page page = Page.of(pageElement);
XPathExpression<Element> xpeAreas = factory.compile(
"h:div[@class='ocr_carea']",
Filters.element(),
null,
Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
);
List<Element> areas = xpeAreas.evaluate(pageElement);
for (Element areaElement : areas) {
Area area = Area.of(areaElement);
area.setParent(page);
page.addChild(area);
XPathExpression<Element> xpeParagraphs = factory.compile(
"h:p[@class='ocr_par']",
Filters.element(),
null,
Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
);
List<Element> paragraphs = xpeParagraphs.evaluate(areaElement);
for (Element paragraphElement : paragraphs) {
Paragraph paragraph = Paragraph.of(paragraphElement);
paragraph.setParent(area);
area.addChild(paragraph);
XPathExpression<Element> xpeLines = factory.compile(
"h:span[@class='ocr_line']",
Filters.element(),
null,
Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
);
List<Element> lines = xpeLines.evaluate(paragraphElement);
for (Element lineElement : lines) {
Line line = Line.of(lineElement);
line.setParent(paragraph);
paragraph.addChild(line);
XPathExpression<Element> xpeWords = factory.compile(
"h:span[@class='ocrx_word']",
Filters.element(),
null,
Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
);
List<Element> words = xpeWords.evaluate(lineElement);
for (Element wordElement : words) {
Word word = Word.of(wordElement);
word.setParent(line);
line.addChild(word);
}
}
}
}
scannedFile.addChild(page);
}
return scannedFile;
} catch (JDOMException | IOException ex) {
ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
throw new AutomaticExecutionFailedException(ex);
}
}
An example usage:
ScannedFile scannedFile = OCRUtils.createScannedFileFromHOCR(path);
System.out.println("Created ScannedFile");
long elementCount = scannedFile.getPages()
.flatMap(Page::getChildren)
.flatMap(Area::getChildren)
.flatMap(Paragraph::getChildren)
.flatMap(Line::getChildren)
.count();
List<String> wordList = scannedFile.getPages()
.flatMap(Page::getChildren)
.flatMap(Area::getChildren)
.flatMap(Paragraph::getChildren)
.flatMap(Line::getChildren)
.map(Word::getContent)
.collect(Collectors.toList());
System.out.println("Word count: " + elementCount);
System.out.println("Word list: " + wordList);
1 Answer 1
Looking at just the JDOM portion, there are a couple of tricks you can play.
Unfortunately, XML and streams will always be an uncomfortable mix...
The following factors in JDOM make better performance possible:
- DRY
- XPathExpressions are threadsafe. Reuse them
- Namespaces are threadsafe, and immutable. Reuse them
- SAXBuilder has a relatively slow setup time (it needs to query installed XML parsers, etc.). JDOM can reuse a single configuration using a SAXEngine....
It is great that you are using relative XPath queries, and using the smaller context for them. This makes a big performance difference.
This is about as neat as I have seen this work done. It is good. I am not sure you will appreciate it if I were to suggest this is really good, and that the alternative tools/libraries (other than JDOM) would be much harder to get right.
Putting these things together, your createScannedFileFromHOCR
method can become the following collection of statics (fully thread-safe, and better performing):
private static final XPathFactory factory = XPathFactory.instance();
private static final Namespace XHTML = Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml");
private static final XPathExpression<Element> OCR_PAGE = factory.compile( //arguments are untyped?
"/h:html/h:body/h:div[@class='ocr_page']",
Filters.element(),
null,
XHTML
);
private static final XPathExpression<Element> OCR_CAREA = factory.compile(
"h:div[@class='ocr_carea']",
Filters.element(),
null,
XHTML
);
private static final XPathExpression<Element> OCR_PAR = factory.compile(
"h:p[@class='ocr_par']",
Filters.element(),
null,
XHTML
);
private static final XPathExpression<Element> OCR_LINE = factory.compile(
"h:span[@class='ocr_line']",
Filters.element(),
null,
XHTML
);
private static final XPathExpression<Element> OCRX_WORD = factory.compile(
"h:span[@class='ocrx_word']",
Filters.element(),
null,
XHTML
);
private static final SAXBuilder XMLBUILDER = new SAXBuilder();
private static final ThreadLocal<SAXEngine> XMLENGINE = new ThreadLocal<SAXEngine>() {
@Override
protected SAXEngine initialValue() {
try {
return XMLBUILDER.buildEngine();
} catch (JDOMException e) {
throw new IllegalStateException("Unable to build Engine", e);
}
}
};
public static ScannedFile createScannedFileFromHOCR(final Path directory) {
try {
Objects.requireNonNull(directory);
Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
Document document = XMLENGINE.get().build(htmlFile.toFile());
ScannedFile scannedFile = new ScannedFile();
System.out.println("factory = " + factory);
System.out.println("xpePages = " + OCR_PAGE);
List<Element> pages = OCR_PAGE.evaluate(document);
System.out.println("pages = " + pages);
for (Element pageElement : pages) {
Page page = Page.of(pageElement);
List<Element> areas = OCR_CAREA.evaluate(pageElement);
for (Element areaElement : areas) {
Area area = Area.of(areaElement);
area.setParent(page);
page.addChild(area);
List<Element> paragraphs = OCR_PAR.evaluate(areaElement);
for (Element paragraphElement : paragraphs) {
Paragraph paragraph = Paragraph.of(paragraphElement);
paragraph.setParent(area);
area.addChild(paragraph);
List<Element> lines = OCR_LINE.evaluate(paragraphElement);
for (Element lineElement : lines) {
Line line = Line.of(lineElement);
line.setParent(paragraph);
paragraph.addChild(line);
List<Element> words = OCRX_WORD.evaluate(lineElement);
for (Element wordElement : words) {
Word word = Word.of(wordElement);
word.setParent(line);
line.addChild(word);
}
}
}
}
scannedFile.addChild(page);
}
return scannedFile;
} catch (JDOMException | IOException ex) {
ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
throw new AutomaticExecutionFailedException(ex);
}
}