Saving an XML file created by Tesseract OCR as a traversable object

Question 1

My project here works upon output that comes out of a Tesseract OCR scan using hOCR format, then I read it with JDOM 2.0 and finally save it one of my own objects, which at a later point needs to be serializable. I have spotted one major codesmell, which is a for-loop of 5 levels deep.

An example hOCR output file:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <title></title>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 <meta name='ocr-system' content='tesseract 3.02' />
 <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
 <div class='ocr_page' id='page_1' title='image "D:\DPC2\converted60円60円.tiff"; bbox 0 0 2479 3508; ppageno 0'>
 <div class='ocr_carea' id='block_1_1' title="bbox 1690 267 2165 394">
 <p class='ocr_par' dir='ltr' id='par_1' title="bbox 1690 267 2165 394">
 <span class='ocr_line' id='line_1' title="bbox 1690 267 2165 394"><span class='ocrx_word' id='word_1' title="bbox 1690 267 2165 394"> </span> 
 </span>
 </p>
 </div>
 </div>
 </body>
</html>

In reality there can be many pages, areas, paragraphs, lines and words. I have only showed one of each here for simplicity and because I cannot disclose full files, also note that the content of the word shown here is " ", a string of one space.
A minor defect is that there are still some TODO comments which I need to transform to actual loggers.

First the Traversable structure:

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
public interface Traversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> {
 default public boolean hasParent() {
 return (getParent() != null);
 }
 public P getParent();
 default public boolean hasChildren() {
 return (getChildren().count() > 0);
 }
 public Stream<C> getChildren();
 public void setParent(final P parent);
 public void addChild(final C child);
 default public void addChildren(final Stream<? extends C> children) {
 children.forEach(this::addChild);
 }
 public String getId();
 public BoundingBox getBoundingBox();
 abstract public static class Void implements Traversable<Void, Void> {
 @Override
 public Void getParent() {
 return null;
 }
 @Override
 public Stream<Void> getChildren() {
 return Stream.empty();
 }
 @Override
 public void setParent(final Void parent) { }
 @Override
 public void addChild(final Void child) { }
 @Override
 public String getId() {
 return null;
 }
 @Override
 public BoundingBox getBoundingBox() {
 return null;
 }
 }
}

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
abstract public class AbstractTraversable<P extends Traversable<?, ?>, C extends Traversable<?, ?>> implements Traversable<P, C> {
 protected final Collection<C> children = new ArrayList<>();
 protected P parent;
 @Override
 public P getParent() {
 return parent;
 }
 @Override
 public Stream<C> getChildren() {
 return children.stream();
 }
 @Override
 public void setParent(final P parent) {
 this.parent = Objects.requireNonNull(parent);
 }
 @Override
 public void addChild(final C child) {
 this.children.add(Objects.requireNonNull(child));
 }
}

/**
 *
 * @author Frank van Heeswijk
 * @param <P> The type of the parent
 * @param <C> The type of the children
 */
abstract public class FileElement<P extends FileElement<?, ?>, C extends FileElement<?, ?>> extends AbstractTraversable<P, C> {
 protected final String id;
 protected final BoundingBox boundingBox;
 public FileElement(final String id, final BoundingBox boundingBox) {
 this.id = Objects.requireNonNull(id);
 this.boundingBox = Objects.requireNonNull(boundingBox);
 }
 @Override
 public String getId() {
 return id;
 }
 @Override
 public BoundingBox getBoundingBox() {
 return boundingBox;
 }
 public static <E extends FileElement<?, ?>> E ofBoundingBoxElement(final Element element, final BiFunction<String, BoundingBox, E> constructor) {
 Objects.requireNonNull(element);
 Objects.requireNonNull(constructor);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 return constructor.apply(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle))
 );
 }
 abstract public static class Void extends FileElement<Void, Void> { 
 public Void(String id, BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 }
}

public class BoundingBox {
 private final int x1;
 private final int y1;
 private final int x2;
 private final int y2;
 public BoundingBox(final int x1, final int y1, final int x2, final int y2) {
 this.x1 = x1;
 this.y1 = y1;
 this.x2 = x2;
 this.y2 = y2;
 }
 public int getX1() {
 return x1;
 }
 public int getY1() {
 return y1;
 }
 public int getX2() {
 return x2;
 }
 public int getY2() {
 return y2;
 }
 @Override
 public String toString() {
 return BoundingBox.class.getSimpleName() + "(" + x1 + ", " + y1 + ", " + x2 + ", " + y2 + ")";
 }
 @Override
 public int hashCode() {
 int hash = 7;
 hash = 89 * hash + this.x1;
 hash = 89 * hash + this.y1;
 hash = 89 * hash + this.x2;
 hash = 89 * hash + this.y2;
 return hash;
 }
 @Override
 public boolean equals(Object obj) {
 if (obj == null) {
 return false;
 }
 if (getClass() != obj.getClass()) {
 return false;
 }
 final BoundingBox other = (BoundingBox) obj;
 if (this.x1 != other.x1) {
 return false;
 }
 if (this.y1 != other.y1) {
 return false;
 }
 if (this.x2 != other.x2) {
 return false;
 }
 if (this.y2 != other.y2) {
 return false;
 }
 return true;
 }
}

public class Title {
 private static final Map<String, BiConsumer<Title, Stream<String>>> KEYWORD_CONSUMERS = new HashMap<>();
 static {
 KEYWORD_CONSUMERS.put("bbox", new BoundingBoxConsumer());
 KEYWORD_CONSUMERS.put("image", new ImagePathConsumer());
 KEYWORD_CONSUMERS.put("ppageno", new PageNumberConsumer());
 }
 private Optional<BoundingBox> boudingBox = Optional.empty();
 private Optional<Path> imagePath = Optional.empty();
 private OptionalInt pageNumber = OptionalInt.empty();
 public Title(final String title) {
 process(Objects.requireNonNull(title));
 }
 private void process(final String title) {
 for (String part : splitTitle(title)) {
 List<String> parts = Arrays.stream(part.split(" "))
 .map(String::trim)
 .filter(str -> !str.isEmpty())
 .collect(Collectors.toList());
 if (parts.size() <= 1) {
 //TODO logger, ignoring because x
 continue;
 }
 String keyword = parts.get(0);
 if (!KEYWORD_CONSUMERS.containsKey(keyword)) {
 //TODO logger, zz not found
 continue;
 }
 KEYWORD_CONSUMERS.get(keyword).accept(this, parts.subList(1, parts.size()).stream());
 }
 }
 private List<String> splitTitle(final String input) {
 return Arrays.asList(input.split(";(?=([^\"]*\"[^\"]*\")*[^\"]*$)"));
 }
 private void setBoundingBox(final BoundingBox boundingBox) {
 Objects.requireNonNull(boundingBox);
 this.boudingBox = Optional.of(boundingBox);
 }
 private void setImagePath(final Path imagePath) {
 Objects.requireNonNull(imagePath);
 this.imagePath = Optional.of(imagePath);
 }
 private void setPageNumber(final int pageNumber) {
 this.pageNumber = OptionalInt.of(pageNumber);
 }
 public Optional<BoundingBox> getBoundingBox() {
 return boudingBox;
 }
 public Optional<Path> getImagePath() {
 return imagePath;
 }
 public OptionalInt getPageNumber() {
 return pageNumber;
 }
 @Override
 public String toString() {
 return Title.class.getSimpleName() + "(" + boudingBox + ", " + imagePath + ", " + pageNumber + ")";
 }
 private static class BoundingBoxConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
 if (arguments.size() != 4) {
 //TODO logger, ignoring because y
 }
 title.setBoundingBox(new BoundingBox(arguments.get(0), arguments.get(1), arguments.get(2), arguments.get(3)));
 }
 }
 private static class ImagePathConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Path> arguments = stream.map(str -> str.replace("\"", "")).map(Paths::get).collect(Collectors.toList());
 if (arguments.size() != 1) {
 //TODO logger, ignoring because yy
 }
 title.setImagePath(arguments.get(0));
 }
 }
 private static class PageNumberConsumer implements BiConsumer<Title, Stream<String>> {
 @Override
 public void accept(final Title title, final Stream<String> stream) {
 Objects.requireNonNull(title);
 Objects.requireNonNull(stream);
 List<Integer> arguments = stream.map(Integer::parseInt).collect(Collectors.toList());
 if (arguments.size() != 1) {
 //TODO logger, ignoring because yyy
 }
 title.setPageNumber(arguments.get(0));
 }
 }
}

public class Page extends FileElement<FileElement.Void, Area> {
 private final Path imagePath;
 private final int pageNumber;
 public static Page of(final Element element) {
 Objects.requireNonNull(element);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 return new Page(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
 title.getImagePath().orElseThrow(() -> new IllegalStateException("No image path present in: " + elementTitle)),
 title.getPageNumber().orElseThrow(() -> new IllegalStateException("No page number present in: " + elementTitle))
 );
 }
 public Page(final String id, final BoundingBox boundingBox, final Path imagePath, final int pageNumber) {
 super(id, boundingBox);
 this.imagePath = Objects.requireNonNull(imagePath);
 this.pageNumber = Objects.requireNonNull(pageNumber);
 }
 @Override
 public FileElement.Void getParent() {
 return null;
 }
 @Override
 public void setParent(final FileElement.Void parent) {
 throw new IllegalStateException("The Page element has no parent.");
 }
 public Path getImagePath() {
 return imagePath;
 }
 public int getPageNumber() {
 return pageNumber;
 }
}

public class Area extends FileElement<Page, Paragraph> {
 public Area(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Area of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Area::new);
 }
}

public class Paragraph extends FileElement<Area, Line> {
 public Paragraph(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Paragraph of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Paragraph::new);
 }
}

public class Line extends FileElement<Paragraph, Word> {
 public Line(final String id, final BoundingBox boundingBox) {
 super(id, boundingBox);
 }
 public static final Line of(final Element element) {
 return FileElement.ofBoundingBoxElement(Objects.requireNonNull(element), Line::new);
 }
}

public class Word extends FileElement<Line, FileElement.Void> {
 private final String content;
 private final boolean strong;
 public Word(final String id, final BoundingBox boundingBox, final String content, final boolean strong) {
 super(id, boundingBox);
 this.content = Objects.requireNonNull(content);
 this.strong = strong;
 }
 public static final Word of(final Element element) {
 Objects.requireNonNull(element);
 String elementId = element.getAttributeValue("id");
 String elementTitle = element.getAttributeValue("title");
 Title title = new Title(elementTitle);
 Element child = element.getChild("strong");
 return new Word(
 elementId,
 title.getBoundingBox().orElseThrow(() -> new IllegalStateException("No bounding box present in: " + elementTitle)),
 child == null ? element.getText() : child.getText(),
 child != null
 );
 }
 @Override
 public Stream<Void> getChildren() {
 return Stream.empty();
 }
 @Override
 public void addChild(final Void child) {
 throw new IllegalStateException("The Word element has no children.");
 }
 public String getContent() {
 return content;
 }
 public boolean isStrong() {
 return strong;
 }
}

The usage:

Note that the ScannedFile is not complete (but working) yet, I will put convienience methods in there once I need them.

public class ScannedFile implements Serializable {
 private static final long serialVersionUID = 859948374926589L;
 private final List<Page> pages = new ArrayList<>();
 public void addChild(final Page page) {
 pages.add(page);
 }
 public Stream<Page> getPages() {
 return pages.stream();
 }
}

The actual XML reading, which has some code smell in it:

public static ScannedFile createScannedFileFromHOCR(final Path directory) {
 try {
 Objects.requireNonNull(directory);
 Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
 SAXBuilder xmlBuilder = new SAXBuilder();
 Document document = xmlBuilder.build(htmlFile.toFile());
 ScannedFile scannedFile = new ScannedFile();
 XPathFactory factory = XPathFactory.instance();
 System.out.println("factory = " + factory);
 XPathExpression<Element> xpePages = factory.compile( //arguments are untyped?
 "/h:html/h:body/h:div[@class='ocr_page']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 System.out.println("xpePages = " + xpePages);
 List<Element> pages = xpePages.evaluate(document);
 System.out.println("pages = " + pages);
 for (Element pageElement : pages) {
 Page page = Page.of(pageElement);
 XPathExpression<Element> xpeAreas = factory.compile(
 "h:div[@class='ocr_carea']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> areas = xpeAreas.evaluate(pageElement);
 for (Element areaElement : areas) {
 Area area = Area.of(areaElement);
 area.setParent(page);
 page.addChild(area);
 XPathExpression<Element> xpeParagraphs = factory.compile(
 "h:p[@class='ocr_par']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> paragraphs = xpeParagraphs.evaluate(areaElement);
 for (Element paragraphElement : paragraphs) {
 Paragraph paragraph = Paragraph.of(paragraphElement);
 paragraph.setParent(area);
 area.addChild(paragraph);
 XPathExpression<Element> xpeLines = factory.compile(
 "h:span[@class='ocr_line']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> lines = xpeLines.evaluate(paragraphElement);
 for (Element lineElement : lines) {
 Line line = Line.of(lineElement);
 line.setParent(paragraph);
 paragraph.addChild(line);
 XPathExpression<Element> xpeWords = factory.compile(
 "h:span[@class='ocrx_word']",
 Filters.element(),
 null,
 Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml")
 );
 List<Element> words = xpeWords.evaluate(lineElement);
 for (Element wordElement : words) {
 Word word = Word.of(wordElement);
 word.setParent(line);
 line.addChild(word);
 }
 }
 }
 }
 scannedFile.addChild(page);
 }
 return scannedFile;
 } catch (JDOMException | IOException ex) {
 ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
 throw new AutomaticExecutionFailedException(ex);
 }
}

An example usage:

ScannedFile scannedFile = OCRUtils.createScannedFileFromHOCR(path);
System.out.println("Created ScannedFile");
long elementCount = scannedFile.getPages()
 .flatMap(Page::getChildren)
 .flatMap(Area::getChildren)
 .flatMap(Paragraph::getChildren)
 .flatMap(Line::getChildren)
 .count();
List<String> wordList = scannedFile.getPages()
 .flatMap(Page::getChildren)
 .flatMap(Area::getChildren)
 .flatMap(Paragraph::getChildren)
 .flatMap(Line::getChildren)
 .map(Word::getContent)
 .collect(Collectors.toList());
System.out.println("Word count: " + elementCount);
System.out.println("Word list: " + wordList);

Question 2

Looking at just the JDOM portion, there are a couple of tricks you can play.

Unfortunately, XML and streams will always be an uncomfortable mix...

The following factors in JDOM make better performance possible:

DRY
- XPathExpressions are threadsafe. Reuse them
- Namespaces are threadsafe, and immutable. Reuse them
- SAXBuilder has a relatively slow setup time (it needs to query installed XML parsers, etc.). JDOM can reuse a single configuration using a SAXEngine....

It is great that you are using relative XPath queries, and using the smaller context for them. This makes a big performance difference.

This is about as neat as I have seen this work done. It is good. I am not sure you will appreciate it if I were to suggest this is really good, and that the alternative tools/libraries (other than JDOM) would be much harder to get right.

Putting these things together, your createScannedFileFromHOCR method can become the following collection of statics (fully thread-safe, and better performing):

private static final XPathFactory factory = XPathFactory.instance();
private static final Namespace XHTML = Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml");
private static final XPathExpression<Element> OCR_PAGE = factory.compile( //arguments are untyped?
 "/h:html/h:body/h:div[@class='ocr_page']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_CAREA = factory.compile(
 "h:div[@class='ocr_carea']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_PAR = factory.compile(
 "h:p[@class='ocr_par']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_LINE = factory.compile(
 "h:span[@class='ocr_line']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCRX_WORD = factory.compile(
 "h:span[@class='ocrx_word']",
 Filters.element(),
 null,
 XHTML
 );
private static final SAXBuilder XMLBUILDER = new SAXBuilder();
private static final ThreadLocal<SAXEngine> XMLENGINE = new ThreadLocal<SAXEngine>() {
 @Override
 protected SAXEngine initialValue() {
 try {
 return XMLBUILDER.buildEngine();
 } catch (JDOMException e) {
 throw new IllegalStateException("Unable to build Engine", e);
 }
 }
};
public static ScannedFile createScannedFileFromHOCR(final Path directory) {
 try {
 Objects.requireNonNull(directory);
 Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
 Document document = XMLENGINE.get().build(htmlFile.toFile());
 ScannedFile scannedFile = new ScannedFile();
 System.out.println("factory = " + factory);
 System.out.println("xpePages = " + OCR_PAGE);
 List<Element> pages = OCR_PAGE.evaluate(document);
 System.out.println("pages = " + pages);
 for (Element pageElement : pages) {
 Page page = Page.of(pageElement);
 List<Element> areas = OCR_CAREA.evaluate(pageElement);
 for (Element areaElement : areas) {
 Area area = Area.of(areaElement);
 area.setParent(page);
 page.addChild(area);
 List<Element> paragraphs = OCR_PAR.evaluate(areaElement);
 for (Element paragraphElement : paragraphs) {
 Paragraph paragraph = Paragraph.of(paragraphElement);
 paragraph.setParent(area);
 area.addChild(paragraph);
 List<Element> lines = OCR_LINE.evaluate(paragraphElement);
 for (Element lineElement : lines) {
 Line line = Line.of(lineElement);
 line.setParent(paragraph);
 paragraph.addChild(line);
 List<Element> words = OCRX_WORD.evaluate(lineElement);
 for (Element wordElement : words) {
 Word word = Word.of(wordElement);
 word.setParent(line);
 line.addChild(word);
 }
 }
 }
 }
 scannedFile.addChild(page);
 }
 return scannedFile;
 } catch (JDOMException | IOException ex) {
 ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
 throw new AutomaticExecutionFailedException(ex);
 }
}

rolfl rolfl 98.1k17 gold badges219 silver badges419 bronze badges · Accepted Answer · 2014-04-30 12:19:27Z

Looking at just the JDOM portion, there are a couple of tricks you can play.

Unfortunately, XML and streams will always be an uncomfortable mix...

The following factors in JDOM make better performance possible:

DRY
- XPathExpressions are threadsafe. Reuse them
- Namespaces are threadsafe, and immutable. Reuse them
- SAXBuilder has a relatively slow setup time (it needs to query installed XML parsers, etc.). JDOM can reuse a single configuration using a SAXEngine....

It is great that you are using relative XPath queries, and using the smaller context for them. This makes a big performance difference.

This is about as neat as I have seen this work done. It is good. I am not sure you will appreciate it if I were to suggest this is really good, and that the alternative tools/libraries (other than JDOM) would be much harder to get right.

Putting these things together, your createScannedFileFromHOCR method can become the following collection of statics (fully thread-safe, and better performing):

private static final XPathFactory factory = XPathFactory.instance();
private static final Namespace XHTML = Namespace.getNamespace("h", "http://www.w3.org/1999/xhtml");
private static final XPathExpression<Element> OCR_PAGE = factory.compile( //arguments are untyped?
 "/h:html/h:body/h:div[@class='ocr_page']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_CAREA = factory.compile(
 "h:div[@class='ocr_carea']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_PAR = factory.compile(
 "h:p[@class='ocr_par']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCR_LINE = factory.compile(
 "h:span[@class='ocr_line']",
 Filters.element(),
 null,
 XHTML
 );
private static final XPathExpression<Element> OCRX_WORD = factory.compile(
 "h:span[@class='ocrx_word']",
 Filters.element(),
 null,
 XHTML
 );
private static final SAXBuilder XMLBUILDER = new SAXBuilder();
private static final ThreadLocal<SAXEngine> XMLENGINE = new ThreadLocal<SAXEngine>() {
 @Override
 protected SAXEngine initialValue() {
 try {
 return XMLBUILDER.buildEngine();
 } catch (JDOMException e) {
 throw new IllegalStateException("Unable to build Engine", e);
 }
 }
};
public static ScannedFile createScannedFileFromHOCR(final Path directory) {
 try {
 Objects.requireNonNull(directory);
 Path htmlFile = FileUtils.getSingleFileWithExtensionInDirectory(directory, Extension.HTML);
 Document document = XMLENGINE.get().build(htmlFile.toFile());
 ScannedFile scannedFile = new ScannedFile();
 System.out.println("factory = " + factory);
 System.out.println("xpePages = " + OCR_PAGE);
 List<Element> pages = OCR_PAGE.evaluate(document);
 System.out.println("pages = " + pages);
 for (Element pageElement : pages) {
 Page page = Page.of(pageElement);
 List<Element> areas = OCR_CAREA.evaluate(pageElement);
 for (Element areaElement : areas) {
 Area area = Area.of(areaElement);
 area.setParent(page);
 page.addChild(area);
 List<Element> paragraphs = OCR_PAR.evaluate(areaElement);
 for (Element paragraphElement : paragraphs) {
 Paragraph paragraph = Paragraph.of(paragraphElement);
 paragraph.setParent(area);
 area.addChild(paragraph);
 List<Element> lines = OCR_LINE.evaluate(paragraphElement);
 for (Element lineElement : lines) {
 Line line = Line.of(lineElement);
 line.setParent(paragraph);
 paragraph.addChild(line);
 List<Element> words = OCRX_WORD.evaluate(lineElement);
 for (Element wordElement : words) {
 Word word = Word.of(wordElement);
 word.setParent(line);
 line.addChild(word);
 }
 }
 }
 }
 scannedFile.addChild(page);
 }
 return scannedFile;
 } catch (JDOMException | IOException ex) {
 ManualUtils.moveToManual(Base.MANUAL_DIRECTORY, directory);
 throw new AutomaticExecutionFailedException(ex);
 }
}

Stack Exchange Network

Saving an XML file created by Tesseract OCR as a traversable object

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Saving an XML file created by Tesseract OCR as a traversable object

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions