Intro
(This post has a continuation at Get histogram of bytes in any set of files in Java - take II.)
This time I decided to rewrite the byte histogram counters in C++ [1][2][3] to Java. Usage is the same: if no arguments is supplied, the program reads from the standard input. Otherwise, it attempts to treat all the arguments as file names and attempts to count bytes in them.
Code
com.github.coderodde.file.util.ByteHistogram.java:
package com.github.coderodde.file.util;
/**
* This class implements the byte histogram.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class ByteHistogram {
private static final int HISTOGRAM_CAPACITY = 256;
private static final int SCREEN_WIDTH = 80;
private static final int LINE_PREAMBLE_WIDTH = 11;
private final long[] data = new long[HISTOGRAM_CAPACITY];
/**
* Account the byte {@code b}.
*
* @param b the byte to account.
*/
public void insert(final byte b) {
data[Byte.toUnsignedInt(b)]++;
}
/**
* Converts this byte histogram to an ASCII art.
*
* @return ASCII art version of this byte histogram.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
final long maximumCount = computeMaximumCount();
final int countStringLength =
computeCountStringLength(maximumCount);
final String lineFormat = getLineFormat(countStringLength);
for (int i = 0; i < data.length; i++) {
loadLine(sb,
lineFormat,
countStringLength,
i,
maximumCount);
}
return sb.toString();
}
/**
* Builds the format for printing the lines in the output.
*
* @param countStringLength the count string length in characters.
*
* @return the format for printing the lines in the output.
*/
private static String getLineFormat(final int countStringLength) {
return String.format("0x%%02x [%%c]: %% %dd %%s\n", countStringLength);
}
/**
* Loads a single line to the total output of this byte histogram.
*
* @param sb the string builder.
* @param lineFormat the format of the line.
* @param countStringLength the length of the count string.
* @param index the byte index.
* @param maximumCount the maximum count in the histogram.
*/
private void loadLine(final StringBuilder sb,
final String lineFormat,
final int countStringLength,
final int index,
final long maximumCount) {
sb.append(
String.format(
lineFormat,index,
Character.isLetterOrDigit((char) index) ?
(char) index :
'?',
data[index],
computeBarAscii(data[index],
maximumCount,
countStringLength)));
}
/**
* Computes and returns the bar ASCII art.
*
* @param count the count of the line we are processing.
* @param maximumCount the maximum count in the byte histogram.
* @param countStringLength the count string length.
*
* @return the bar ASCII art.
*/
private static String computeBarAscii(final long count,
final long maximumCount,
final int countStringLength) {
final float ratio = ((float) count) / ((float) maximumCount);
final int maximumBarLength = SCREEN_WIDTH
- LINE_PREAMBLE_WIDTH
- countStringLength;
final int barLength = (int)(ratio * maximumBarLength);
final StringBuilder sb = new StringBuilder(barLength);
for (int i = 0; i < barLength; i++) {
sb.append("*");
}
return sb.toString();
}
/**
* Computes the maximum count in this byte histogram.
*
* @return the maximum count.
*/
private long computeMaximumCount() {
long m = 0L;
for (final long count : data) {
m = Math.max(m, count);
}
return m;
}
/**
* Computes and returns the length of the widest length string.
*
* @param maximumLength the maximum length of the byte histogram.
*
* @return the widest length of the count string in characters.
*/
private static int computeCountStringLength(final long maximumLength) {
return Long.toString(maximumLength).length();
}
}
com.github.coderodde.file.util.ByteHistogramApp.java:
package com.github.coderodde.file.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
* This class implements a program for counting byte histograms in files.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class ByteHistogramApp {
public static void main(String[] args) {
List<InputStream> inputStreamList = null;
// Prepare the input streams from which to build the (shared) byte
// histogram:
try {
inputStreamList = getInputStreams(args);
} catch (final MultipleFileNotFoundException ex) {
ex.getExceptionList()
.forEach((e) -> System.err.println(e.getMessage()));
System.exit(-1);
}
// Once here, we have valid input streams. Request the histogram and
// print it in the console:
try {
System.out.println(processInputStreamList(inputStreamList));
} catch (MultipleIOException ex) {
ex.getExceptionList()
.forEach((e) -> System.err.println(e.getMessage()));
System.exit(-2);
}
}
/**
* Converts the input argument list to the list of input streams.
*
* @param args the names of the files to process.
*
* @return the input stream list.
*
* @throws MultipleFileNotFoundException if any file failed.
*/
private static List<InputStream> getInputStreams(final String[] args)
throws MultipleFileNotFoundException {
if (args.length == 0) {
return List.of(System.in);
}
final List<InputStream> inputStreamList = new ArrayList<>(args.length);
final MultipleFileNotFoundException exceptionList =
new MultipleFileNotFoundException();
for (final String fileName : args) {
try {
final InputStream is = new FileInputStream(new File(fileName));
inputStreamList.add(is);;
} catch (final FileNotFoundException ex) {
// Add the exception ex to the exceptionList:
exceptionList.add(ex);
}
}
if (!exceptionList.isEmpty()) {
// Once here, something went wrong. Throw:
throw exceptionList;
}
return inputStreamList;
}
/**
* Builds the shared histogram from the input streams in the argument.
*
* @param inputStreamList the list of input stream supplying the bytes.
*
* @return the shared byte histogram.
*
* @throws MultipleIOException if any stream threw.
*/
private static ByteHistogram
processInputStreamList(final List<InputStream> inputStreamList)
throws MultipleIOException {
final ByteHistogram histogram = new ByteHistogram();
final MultipleIOException ex = new MultipleIOException();
for (final InputStream is : inputStreamList) {
try {
processInputStream(is, histogram);
} catch (final IOException e) {
// Add the new I/O exception e to ex::
ex.add(e);
}
}
if (!ex.isEmpty()) {
// Once here, something went wrong. Throw:
throw ex;
}
return histogram;
}
/**
* Processes the input stream reading bytes from it until end of file is
* reached.
*
* @param is the input stream.
* @param histogram the target histogram.
*
* @throws IOException if I/O fails.
*/
private static void processInputStream(final InputStream is,
final ByteHistogram histogram)
throws IOException {
int i;
while ((i = is.read()) != -1) {
histogram.insert((byte) i);
}
}
}
com.github.coderodde.file.util.MultipleFileNotFoundException.java:
package com.github.coderodde.file.util;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
/**
* This class holds a list of actual exception object of type
* {@link java.io.FileNotFoundException}.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class MultipleFileNotFoundException extends Exception {
private final List<FileNotFoundException> exceptionList = new ArrayList<>();
public void add(final FileNotFoundException ex) {
exceptionList.add(
Objects.requireNonNull(
ex,
"The input exception is null."));
}
public boolean isEmpty() {
return exceptionList.isEmpty();
}
public Collection<FileNotFoundException> getExceptionList() {
return Collections.unmodifiableCollection(exceptionList);
}
}
com.github.coderodde.file.util.MultipleIOException.java:
package com.github.coderodde.file.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
/**
* This class holds a list of actual exception object of type
* {@link java.io.IOException}.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class MultipleIOException extends Exception {
private final List<IOException> exceptionList = new ArrayList<>();
public void add(final IOException ex) {
exceptionList.add(
Objects.requireNonNull(
ex,
"The input exception is null."));
}
public boolean isEmpty() {
return exceptionList.isEmpty();
}
public Collection<IOException> getExceptionList() {
return Collections.unmodifiableCollection(exceptionList);
}
}
Typical output
C:\Users\rodio\OneDrive\Documents\NetBeansProjects\ByteHistogram.java\target\classes>echo hello world | java com.github.
coderodde.file.util.ByteHistogramApp
0x00 [?]: 0
0x01 [?]: 0
0x02 [?]: 0
0x03 [?]: 0
0x04 [?]: 0
0x05 [?]: 0
0x06 [?]: 0
0x07 [?]: 0
0x08 [?]: 0
0x09 [?]: 0
0x0a [?]: 1 **********************
0x0b [?]: 0
0x0c [?]: 0
0x0d [?]: 1 **********************
0x0e [?]: 0
0x0f [?]: 0
0x10 [?]: 0
0x11 [?]: 0
0x12 [?]: 0
0x13 [?]: 0
0x14 [?]: 0
0x15 [?]: 0
0x16 [?]: 0
0x17 [?]: 0
0x18 [?]: 0
0x19 [?]: 0
0x1a [?]: 0
0x1b [?]: 0
0x1c [?]: 0
0x1d [?]: 0
0x1e [?]: 0
0x1f [?]: 0
0x20 [?]: 2 *********************************************
0x21 [?]: 0
0x22 [?]: 0
0x23 [?]: 0
0x24 [?]: 0
0x25 [?]: 0
0x26 [?]: 0
0x27 [?]: 0
0x28 [?]: 0
0x29 [?]: 0
0x2a [?]: 0
0x2b [?]: 0
0x2c [?]: 0
0x2d [?]: 0
0x2e [?]: 0
0x2f [?]: 0
0x30 [0]: 0
0x31 [1]: 0
0x32 [2]: 0
0x33 [3]: 0
0x34 [4]: 0
0x35 [5]: 0
0x36 [6]: 0
0x37 [7]: 0
0x38 [8]: 0
0x39 [9]: 0
0x3a [?]: 0
0x3b [?]: 0
0x3c [?]: 0
0x3d [?]: 0
0x3e [?]: 0
0x3f [?]: 0
0x40 [?]: 0
0x41 [A]: 0
0x42 [B]: 0
0x43 [C]: 0
0x44 [D]: 0
0x45 [E]: 0
0x46 [F]: 0
0x47 [G]: 0
0x48 [H]: 0
0x49 [I]: 0
0x4a [J]: 0
0x4b [K]: 0
0x4c [L]: 0
0x4d [M]: 0
0x4e [N]: 0
0x4f [O]: 0
0x50 [P]: 0
0x51 [Q]: 0
0x52 [R]: 0
0x53 [S]: 0
0x54 [T]: 0
0x55 [U]: 0
0x56 [V]: 0
0x57 [W]: 0
0x58 [X]: 0
0x59 [Y]: 0
0x5a [Z]: 0
0x5b [?]: 0
0x5c [?]: 0
0x5d [?]: 0
0x5e [?]: 0
0x5f [?]: 0
0x60 [?]: 0
0x61 [a]: 0
0x62 [b]: 0
0x63 [c]: 0
0x64 [d]: 1 **********************
0x65 [e]: 1 **********************
0x66 [f]: 0
0x67 [g]: 0
0x68 [h]: 1 **********************
0x69 [i]: 0
0x6a [j]: 0
0x6b [k]: 0
0x6c [l]: 3 ********************************************************************
0x6d [m]: 0
0x6e [n]: 0
0x6f [o]: 2 *********************************************
0x70 [p]: 0
0x71 [q]: 0
0x72 [r]: 1 **********************
0x73 [s]: 0
0x74 [t]: 0
0x75 [u]: 0
0x76 [v]: 0
0x77 [w]: 1 **********************
0x78 [x]: 0
0x79 [y]: 0
0x7a [z]: 0
0x7b [?]: 0
0x7c [?]: 0
0x7d [?]: 0
0x7e [?]: 0
0x7f [?]: 0
0x80 [?]: 0
0x81 [?]: 0
0x82 [?]: 0
0x83 [?]: 0
0x84 [?]: 0
0x85 [?]: 0
0x86 [?]: 0
0x87 [?]: 0
0x88 [?]: 0
0x89 [?]: 0
0x8a [?]: 0
0x8b [?]: 0
0x8c [?]: 0
0x8d [?]: 0
0x8e [?]: 0
0x8f [?]: 0
0x90 [?]: 0
0x91 [?]: 0
0x92 [?]: 0
0x93 [?]: 0
0x94 [?]: 0
0x95 [?]: 0
0x96 [?]: 0
0x97 [?]: 0
0x98 [?]: 0
0x99 [?]: 0
0x9a [?]: 0
0x9b [?]: 0
0x9c [?]: 0
0x9d [?]: 0
0x9e [?]: 0
0x9f [?]: 0
0xa0 [?]: 0
0xa1 [?]: 0
0xa2 [?]: 0
0xa3 [?]: 0
0xa4 [?]: 0
0xa5 [?]: 0
0xa6 [?]: 0
0xa7 [?]: 0
0xa8 [?]: 0
0xa9 [?]: 0
0xaa [a]: 0
0xab [?]: 0
0xac [?]: 0
0xad [?]: 0
0xae [?]: 0
0xaf [?]: 0
0xb0 [?]: 0
0xb1 [?]: 0
0xb2 [?]: 0
0xb3 [?]: 0
0xb4 [?]: 0
0xb5 [μ]: 0
0xb6 [?]: 0
0xb7 [?]: 0
0xb8 [?]: 0
0xb9 [?]: 0
0xba [o]: 0
0xbb [?]: 0
0xbc [?]: 0
0xbd [?]: 0
0xbe [?]: 0
0xbf [?]: 0
0xc0 [À]: 0
0xc1 [Á]: 0
0xc2 [Â]: 0
0xc3 [Ã]: 0
0xc4 [Ä]: 0
0xc5 [Å]: 0
0xc6 [Æ]: 0
0xc7 [Ç]: 0
0xc8 [È]: 0
0xc9 [É]: 0
0xca [Ê]: 0
0xcb [Ë]: 0
0xcc [Ì]: 0
0xcd [Í]: 0
0xce [Î]: 0
0xcf [Ï]: 0
0xd0 [Ð]: 0
0xd1 [Ñ]: 0
0xd2 [Ò]: 0
0xd3 [Ó]: 0
0xd4 [Ô]: 0
0xd5 [Õ]: 0
0xd6 [Ö]: 0
0xd7 [?]: 0
0xd8 [Ø]: 0
0xd9 [Ù]: 0
0xda [Ú]: 0
0xdb [Û]: 0
0xdc [Ü]: 0
0xdd [Ý]: 0
0xde [Þ]: 0
0xdf [ß]: 0
0xe0 [à]: 0
0xe1 [á]: 0
0xe2 [â]: 0
0xe3 [ã]: 0
0xe4 [ä]: 0
0xe5 [å]: 0
0xe6 [æ]: 0
0xe7 [ç]: 0
0xe8 [è]: 0
0xe9 [é]: 0
0xea [ê]: 0
0xeb [ë]: 0
0xec [ì]: 0
0xed [í]: 0
0xee [î]: 0
0xef [ï]: 0
0xf0 [ð]: 0
0xf1 [ñ]: 0
0xf2 [ò]: 0
0xf3 [ó]: 0
0xf4 [ô]: 0
0xf5 [õ]: 0
0xf6 [ö]: 0
0xf7 [?]: 0
0xf8 [ø]: 0
0xf9 [ù]: 0
0xfa [ú]: 0
0xfb [û]: 0
0xfc [ü]: 0
0xfd [ý]: 0
0xfe [þ]: 0
0xff [ÿ]: 0
Critique request
I would love to hear any commentary regarding my attempt.
References
[1] Get histogram of bytes in any set of files in C++14
[2] Get histogram of bytes in any set of files in C++14 - take II
1 Answer 1
Resource leaks
You should always close an InputStream
that reads data from a file system.
Use try-with-resources for that purpose.
And you should always release system resources as soon as possible, i.e. close the stream immediately after you've read all required data from it. For that reason, the idea of opening a bunch of streams (your method getInputStreams()
) and then tossing them around is not a nice one.
Buffering
You're reading files byte by byte, triggering disc access at each FileInputStream.read()
invocation. Which is very slow.
Instead, you can leverage bufferization by wrapping each FileInputStream
with BufferedInputStream
, which internally maintains a buffer of 8192
bytes. This way, you can still read byte by byte from the buffer with int read()
.
Or alternatively, use another flavor of read(byte b[])
method which expects a byte array as a parameter (i.e. you will have to create and provide a byte buffer manually).
ByteHistogram
You might want to introduce method insertUnsigned(int)
because currently you're doing you doing two redundant primitive conversions:
int read()
returns byte value represented as an unsignedint
;- then you're casting it into a
byte
; - and then you're passing it into
Byte.toUnsignedInt()
to get the sameint
you've read from the stream.
isLetterOrDigit
isn't nearly the set of useful printable characters. Consider printing everything for whichisISOControl
returns false. \$\endgroup\$