10
\$\begingroup\$

My project here works upon output that comes out of a Tesseract OCR scan using hOCR format, then I read it with JDOM 2.0 and finally save it one of my own objects, which at a later point needs to be serializable. I have spotted one major codesmell, which is a for-loop of 5 levels deep.

An example hOCR output file:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <title></title>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 <meta name='ocr-system' content='tesseract 3.02' />
 <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
 <div class='ocr_page' id='page_1' title='image "D:\DPC2\converted60円60円.tiff"; bbox 0 0 2479 3508; ppageno 0'>
 <div class='ocr_carea' id='block_1_1' title="bbox 1690 267 2165 394">
 <p class='ocr_par' dir='ltr' id='par_1' title="bbox 1690 267 2165 394">
 <span class='ocr_line' id='line_1' title="bbox 1690 267 2165 394"><span class='ocrx_word' id='word_1' title="bbox 1690 267 2165 394"> </span> 
 </span>
 </p>
 </div>
 </div>
 </body>
</html>

In reality there can be many pages, areas, paragraphs, lines and words. I have only showed one of each here for simplicity and because I cannot disclose full files, also note that the content of the word shown here is " ", a string of one space.
A minor defect is that there are still some TODO comments which I need to transform to actual loggers.

First the Traversable structure:

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
public interface Traversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> {
 default public boolean hasParent() {
 return (getParent() != null);
 }
 public P getParent();
 default public boolean hasChildren() {
 return (getChildren().count() > 0);
 }
 public Stream<C> getChildren();
 public void setParent(final P parent);
 public void addChild(final C child);
 default public void addChildren(final Stream<? extends C> children) {
 children.forEach(this::addChild);
 }
 public String getId();
 public BoundingBox getBoundingBox();
 abstract public static class Void implements Traversable<Void, Void> {
 @Override
 public Void getParent() {
 return null;
 }
 @Override
 public Stream<Void> getChildren() {
 return Stream.empty();
 }
 @Override
 public void setParent(final Void parent) { }
 @Override
 public void addChild(final Void child) { }
 @Override
 public String getId() {
 return null;
 }
 @Override
 public BoundingBox getBoundingBox() {
 return null;
 }
 }
}

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
abstract public class AbstractTraversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> implements Traversable<P, C> {
 protected final Collection<C> children = new ArrayList<>();
 protected P parent;
 @Override
 public P getParent() {
 return parent;
 }
 @Override
 public Stream<C> getChildren() {
 return children.stream();
 }
 @Override
 public void setParent(final P parent) {
 this.parent = Objects.requireNonNull(parent);
 }
 @Override
 public void addChild(final C child) {
 this.children.add(Objects.requireNonNull(child));
 }
}

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
abstract public class FileElement<P extends FileElement<?, ?>, C extends FileElement<?, ?>> extends AbstractTraversable<P, C> {
 protected final String id;
 protected final BoundingBox boundingBox;
 public FileElement(final String id, final BoundingBox boundingBox) {
 this.id = Objects.requireNonNull(id);
 this.boundingBox = Objects.requireNonNull(boundingBox);
 }
 @Override
 public String getId() {
 return id;
 }
 @Override
 public BoundingBox getBoundingBox() {
 return boundingBox;
 }
 public static <E extends FileElement<?, ?>> E ofBoundingBoxElement(final Element element, final BiFunction<String, BoundingBox, E> constructor) {
 Objects.requireNonNull(element);
 Objects.requireNonNull(constructor);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 return constructor.apply(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle))
 );
 }
 abstract public static class Void extends FileElement<Void, Void> { 
 public Void(String id, BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 }
}

public class BoundingBox {
 private final int x1;
 private final int y1;
 private final int x2;
 private final int y2;
 public BoundingBox(final int x1, final int y1, final int x2, final int y2) {
 this.x1 = x1;
 this.y1 = y1;
 this.x2 = x2;
 this.y2 = y2;
 }
 public int getX1() {
 return x1;
 }
 public int getY1() {
 return y1;
 }
 public int getX2() {
 return x2;
 }
 public int getY2() {
 return y2;
 }
 @Override
 public String toString() {
 return BoundingBox.class.getSimpleName() + "(" + x1 + ", " + y1 + ", " + x2 + ", " + y2 + ")";
 }
 @Override
 public int hashCode() {
 int hash = 7;
 hash = 89 * hash + this.x1;
 hash = 89 * hash + this.y1;
 hash = 89 * hash + this.x2;
 hash = 89 * hash + this.y2;
 return hash;
 }
 @Override
 public boolean equals(Object obj) {
 if (obj == null) {
 return false;
 }
 if (getClass() != obj.getClass()) {
 return false;
 }
 final BoundingBox other = (BoundingBox) obj;
 if (this.x1 != other.x1) {
 return false;
 }
 if (this.y1 != other.y1) {
 return false;
 }
 if (this.x2 != other.x2) {
 return false;
 }
 if (this.y2 != other.y2) {
 return false;
 }
 return true;
 }
}

public class Title {
 private static final Map<String, BiConsumer<Title, Stream<String>>> KEYWORD_CONSUMERS = new HashMap<>();
 static {
 KEYWORD_CONSUMERS.put("bbox", new BoundingBoxConsumer());
 KEYWORD_CONSUMERS.put("image", new ImagePathConsumer());
 KEYWORD_CONSUMERS.put("ppageno", new PageNumberConsumer());
 }
 private Optional<BoundingBox> boudingBox = Optional.empty();
 private Optional<Path> imagePath = Optional.empty();
 private OptionalInt pageNumber = OptionalInt.empty();
 public Title(final String title) {
 process(Objects.requireNonNull(title));
 }
 private void process(final String title) {
 for (String part : splitTitle(title)) {
 List<String> parts = Arrays.stream(part.split(" "))
 .map(String::trim)
 .filter(str -> !str.isEmpty())
 .collect(Collectors.toList());
 if (parts.size() <= 1) {
 //TODO logger, ignoring because x
 continue;
 }
 String keyword = parts.get(0);
 if (!KEYWORD_CONSUMERS.containsKey(keyword)) {
 //TODO logger, zz not found
 continue;
 }
 KEYWORD_CONSUMERS.get(keyword).accept(this, parts.subList(1, parts.size()).stream());
 }
 }
 private List<String> splitTitle(final String input) {
 return Arrays.asList(input.split(";(?=([^\"]*\"[^\"]*\")*[^\"]*$)"));
 }
 private void setBoundingBox(final BoundingBox boundingBox) {
 Objects.requireNonNull(boundingBox);
 this.boudingBox = Optional.of(boundingBox);
 }
 private void setImagePath(final Path imagePath) {
 Objects.requireNonNull(imagePath);
 this.imagePath = Optional.of(imagePath);
 }
 private void setPageNumber(final int pageNumber) {
 this.pageNumber = OptionalInt.of(pageNumber);
 }
 public Optional<BoundingBox> getBoundingBox() {
 return boudingBox;
 }
 public Optional<Path> getImagePath() {
 return imagePath;
 }
 public OptionalInt getPageNumber() {
 return pageNumber;
 }
 @Override
 public String toString() {
 return Title.class.getSimpleName() + "(" + boudingBox + ", " + imagePath + ", " + pageNumber + ")";
 }
 private static class BoundingBoxConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
 if (arguments.size() != 4) {
 //TODO logger, ignoring because y
 }
 title.setBoundingBox(new BoundingBox(arguments.get(0), arguments.get(1), arguments.get(2), arguments.get(3)));
 }
 }
 private static class ImagePathConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Path> arguments = stream.map(str -> str.replace("\"", "")).map(Paths::get).collect(Collectors.toList());
 if (arguments.size() != 1) {
 //TODO logger, ignoring because yy
 }
 title.setImagePath(arguments.get(0));
 }
 }
 private static class PageNumberConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
 if (arguments.size() != 1) {
 //TODO logger, ignoring because yyy
 }
 title.setPageNumber(arguments.get(0));
 }
 }
}

public class Page extends FileElement<FileElement.Void, Area> {
 private final Path imagePath;
 private final int pageNumber;
 public static Page of(final Element element) {
 Objects.requireNonNull(element);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 return new Page(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
 title.getImagePath().orElseThrow(() -> new IllegalStateException("No image path present in: " + elementTitle)),
 title.getPageNumber().orElseThrow(() -> new IllegalStateException("No page number present in: " + elementTitle))
 );
 }
 public Page(final String id, final BoundingBox boundingBox, final Path imagePath, final int pageNumber) {
 super(id, boundingBox);
 this.imagePath = Objects.requireNonNull(imagePath);
 this.pageNumber = Objects.requireNonNull(pageNumber);
 }
 @Override
 public FileElement.Void getParent() {
 return null;
 }
 @Override
 public void setParent(final FileElement.Void parent) {
 throw new IllegalStateException("The Page element has no parent.");
 }
 public Path getImagePath() {
 return imagePath;
 }
 public int getPageNumber() {
 return pageNumber;
 }
}

public class Area extends FileElement<Page, Paragraph> {
 public Area(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Area of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Area::new);
 }
}

public class Paragraph extends FileElement<Area, Line> {
 public Paragraph(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Paragraph of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Paragraph::new);
 }
}

public class Line extends FileElement<Paragraph, Word> {
 public Line(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Line of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Line::new);
 }
}

public class Word extends FileElement<Line, FileElement.Void> {
 private final String content;
 private final boolean strong;
 public Word(final String id, final BoundingBox boundingBox, final String content, final boolean strong) {
 super(id, boundingBox);
 this.content = Objects.requireNonNull(content);
 this.strong = strong;
 }
 public static final Word of(final Element element) {
 Objects.requireNonNull(element);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 Element child = element.getChild("strong");
 return new Word(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
 child == null ? element.getText() : child.getText(),
 child != null
 );
 }
 @Override
 public Stream<Void> getChildren() {
 return Stream.empty();
 }
 @Override
 public void addChild(final Void child) {
 throw new IllegalStateException("The Word element has no children.");
 }
 public String getContent() {
 return content;
 }
 public boolean isStrong() {
 return strong;
 }
}

The usage:

Note that the ScannedFile is not complete (but working) yet, I will put convienience methods in there once I need them.

public class ScannedFile implements Serializable {
 private static final long serialVersionUID = 859948374926589L;
 private final List<Page> pages = new ArrayList<>();
 public void addChild(final Page page) {
 pages.add(page);
 }
 public Stream<Page> getPages() {
 return pages.stream();
 }
}

The actual XML reading, which has some code smell in it:

public static ScannedFile createScannedFileFromHOCR(final Path directory) {
 try {
 Objects.requireNonNull(directory);
 Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
 SAXBuilder xmlBuilder = new SAXBuilder();
 Document document = xmlBuilder.build(htmlFile.toFile());
 ScannedFile scannedFile = new ScannedFile();
 XPathFactory factory = XPathFactory.instance();
 System.out.println("factory = " + factory);
 XPathExpression<Element> xpePages = factory.compile( //arguments are untyped?
 "/h:html/h:body/h:div[@class='ocr_page']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 System.out.println("xpePages = " + xpePages);
 List<Element> pages = xpePages.evaluate(document);
 System.out.println("pages = " + pages);
 for (Element pageElement : pages) {
 Page page = Page.of(pageElement);
 XPathExpression<Element> xpeAreas = factory.compile(
 "h:div[@class='ocr_carea']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> areas = xpeAreas.evaluate(pageElement);
 for (Element areaElement : areas) {
 Area area = Area.of(areaElement);
 area.setParent(page);
 page.addChild(area);
 XPathExpression<Element> xpeParagraphs = factory.compile(
 "h:p[@class='ocr_par']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> paragraphs = xpeParagraphs.evaluate(areaElement);
 for (Element paragraphElement : paragraphs) {
 Paragraph paragraph = Paragraph.of(paragraphElement);
 paragraph.setParent(area);
 area.addChild(paragraph);
 XPathExpression<Element> xpeLines = factory.compile(
 "h:span[@class='ocr_line']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> lines = xpeLines.evaluate(paragraphElement);
 for (Element lineElement : lines) {
 Line line = Line.of(lineElement);
 line.setParent(paragraph);
 paragraph.addChild(line);
 XPathExpression<Element> xpeWords = factory.compile(
 "h:span[@class='ocrx_word']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> words = xpeWords.evaluate(lineElement);
 for (Element wordElement : words) {
 Word word = Word.of(wordElement);
 word.setParent(line);
 line.addChild(word);
 }
 }
 }
 }
 scannedFile.addChild(page);
 }
 return scannedFile;
 } catch (JDOMException | IOException ex) {
 ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
 throw new AutomaticExecutionFailedException(ex);
 }
}

An example usage:

ScannedFile scannedFile = OCRUtils.createScannedFileFromHOCR(path);
System.out.println("Created ScannedFile");
long elementCount = scannedFile.getPages()
 .flatMap(Page::getChildren)
 .flatMap(Area::getChildren)
 .flatMap(Paragraph::getChildren)
 .flatMap(Line::getChildren)
 .count();
List<String> wordList = scannedFile.getPages()
 .flatMap(Page::getChildren)
 .flatMap(Area::getChildren)
 .flatMap(Paragraph::getChildren)
 .flatMap(Line::getChildren)
 .map(Word::getContent)
 .collect(Collectors.toList());
System.out.println("Word count: " + elementCount);
System.out.println("Word list: " + wordList);
asked Apr 30, 2014 at 11:52
\$\endgroup\$

1 Answer 1

4
\$\begingroup\$

Looking at just the JDOM portion, there are a couple of tricks you can play.

Unfortunately, XML and streams will always be an uncomfortable mix...

The following factors in JDOM make better performance possible:

  • DRY
    • XPathExpressions are threadsafe. Reuse them
    • Namespaces are threadsafe, and immutable. Reuse them
    • SAXBuilder has a relatively slow setup time (it needs to query installed XML parsers, etc.). JDOM can reuse a single configuration using a SAXEngine....

It is great that you are using relative XPath queries, and using the smaller context for them. This makes a big performance difference.

This is about as neat as I have seen this work done. It is good. I am not sure you will appreciate it if I were to suggest this is really good, and that the alternative tools/libraries (other than JDOM) would be much harder to get right.

Putting these things together, your createScannedFileFromHOCR method can become the following collection of statics (fully thread-safe, and better performing):

private static final XPathFactory factory = XPathFactory.instance();
private static final Namespace XHTML = Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml");
private static final XPathExpression<Element> OCR_PAGE = factory.compile( //arguments are untyped?
 "/h:html/h:body/h:div[@class='ocr_page']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_CAREA = factory.compile(
 "h:div[@class='ocr_carea']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_PAR = factory.compile(
 "h:p[@class='ocr_par']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_LINE = factory.compile(
 "h:span[@class='ocr_line']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCRX_WORD = factory.compile(
 "h:span[@class='ocrx_word']",
 Filters.element(),
 null,
 XHTML
 );
private static final SAXBuilder XMLBUILDER = new SAXBuilder();
private static final ThreadLocal<SAXEngine> XMLENGINE = new ThreadLocal<SAXEngine>() {
 @Override
 protected SAXEngine initialValue() {
 try {
 return XMLBUILDER.buildEngine();
 } catch (JDOMException e) {
 throw new IllegalStateException("Unable to build Engine", e);
 }
 }
};
public static ScannedFile createScannedFileFromHOCR(final Path directory) {
 try {
 Objects.requireNonNull(directory);
 Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
 Document document = XMLENGINE.get().build(htmlFile.toFile());
 ScannedFile scannedFile = new ScannedFile();
 System.out.println("factory = " + factory);
 System.out.println("xpePages = " + OCR_PAGE);
 List<Element> pages = OCR_PAGE.evaluate(document);
 System.out.println("pages = " + pages);
 for (Element pageElement : pages) {
 Page page = Page.of(pageElement);
 List<Element> areas = OCR_CAREA.evaluate(pageElement);
 for (Element areaElement : areas) {
 Area area = Area.of(areaElement);
 area.setParent(page);
 page.addChild(area);
 List<Element> paragraphs = OCR_PAR.evaluate(areaElement);
 for (Element paragraphElement : paragraphs) {
 Paragraph paragraph = Paragraph.of(paragraphElement);
 paragraph.setParent(area);
 area.addChild(paragraph);
 List<Element> lines = OCR_LINE.evaluate(paragraphElement);
 for (Element lineElement : lines) {
 Line line = Line.of(lineElement);
 line.setParent(paragraph);
 paragraph.addChild(line);
 List<Element> words = OCRX_WORD.evaluate(lineElement);
 for (Element wordElement : words) {
 Word word = Word.of(wordElement);
 word.setParent(line);
 line.addChild(word);
 }
 }
 }
 }
 scannedFile.addChild(page);
 }
 return scannedFile;
 } catch (JDOMException | IOException ex) {
 ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
 throw new AutomaticExecutionFailedException(ex);
 }
}
answered Apr 30, 2014 at 12:19
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.