Intro
This post is the continuation of Get histogram of bytes in any set of files in Java. This time, I have incorporated all (?) the suggestions provided by Alexander Ivanchenko.
The entire repository is on my GitHub.
Code
com.github.coderodde.file.util.ByteHistogramApp.java:
package com.github.coderodde.file.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
* This class implements a program for counting byte histograms in files.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class ByteHistogramApp {
public static void main(String[] args) {
List<InputStream> inputStreamList = null;
// Prepare the input streams from which to build the (shared) byte
// histogram:
try {
inputStreamList = getInputStreams(args);
} catch (final PairException ex) {
ex.getFileNotFoundException()
.getExceptionList()
.forEach((e) -> System.err.println(e.getMessage()));
ex.getIOException()
.getExceptionList()
.forEach((e) -> System.err.println(e.getMessage()));
System.exit(-1);
}
// Once here, we have valid input streams. Request the histogram and
// print it in the console:
try {
System.out.println(processInputStreamList(inputStreamList));
} catch (MultipleIOException ex) {
ex.getExceptionList()
.forEach((e) -> System.err.println(e.getMessage()));
System.exit(-2);
}
}
/**
* Converts the input argument list to the list of input streams.
*
* @param args the names of the files to process.
*
* @return the input stream list.
*
* @throws MultipleFileNotFoundException if any file failed.
*/
private static List<InputStream> getInputStreams(final String[] args)
throws PairException {
if (args.length == 0) {
return List.of(System.in);
}
final List<InputStream> inputStreamList = new ArrayList<>(args.length);
final MultipleFileNotFoundException exceptionListFileNotFound =
new MultipleFileNotFoundException();
final MultipleIOException exceptionListIO =
new MultipleIOException();
for (final String fileName : args) {
try {
final InputStream is = new FileInputStream(new File(fileName));
inputStreamList.add(is);
} catch (final FileNotFoundException ex) {
// Add the exception ex to the exceptionList:
exceptionListFileNotFound.add(ex);
for (final InputStream is : inputStreamList) {
try {
is.close();
} catch (final IOException ioException) {
exceptionListIO.add(ioException);
}
}
}
}
if (!exceptionListIO.isEmpty() ||
!exceptionListFileNotFound.isEmpty()) {
// Once here, something went wrong. Throw:
throw new PairException(exceptionListFileNotFound,
exceptionListIO);
}
return inputStreamList;
}
/**
* Builds the shared histogram from the input streams in the argument.
*
* @param inputStreamList the list of input stream supplying the bytes.
*
* @return the shared byte histogram.
*
* @throws MultipleIOException if any stream threw.
*/
private static ByteHistogram
processInputStreamList(final List<InputStream> inputStreamList)
throws MultipleIOException {
final ByteHistogram histogram = new ByteHistogram();
final MultipleIOException ex = new MultipleIOException();
for (final InputStream is : inputStreamList) {
try {
processInputStream(
new BufferedInputStream(is),
histogram);
} catch (final IOException e) {
// Add the new I/O exception e to ex::
ex.add(e);
}
}
if (!ex.isEmpty()) {
// Once here, something went wrong. Throw:
throw ex;
}
return histogram;
}
/**
* Processes the input stream reading bytes from it until end of file is
* reached.
*
* @param is the input stream.
* @param histogram the target histogram.
*
* @throws IOException if I/O fails.
*/
private static void processInputStream(final InputStream is,
final ByteHistogram histogram)
throws IOException {
int i;
while ((i = is.read()) != -1) {
histogram.insert(i);
}
is.close();
}
}
com.github.coderodde.file.util.ByteHistogram.java:
package com.github.coderodde.file.util;
/**
* This class implements the byte histogram.
*
* @version 1.0.0 (Nov 13, 2024)
* @since 1.0.0 (Nov 13, 2024)
*/
public final class ByteHistogram {
private static final int HISTOGRAM_CAPACITY = 256;
private static final int SCREEN_WIDTH = 80;
private static final int LINE_PREAMBLE_WIDTH = 11;
private final long[] data = new long[HISTOGRAM_CAPACITY];
/**
* Account the byte {@code b}.
*
* @param b the byte to account.
*/
public void insert(final int b) {
data[b]++;
}
/**
* Converts this byte histogram to an ASCII art.
*
* @return ASCII art version of this byte histogram.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
final long maximumCount = computeMaximumCount();
final int countStringLength =
computeCountStringLength(maximumCount);
final String lineFormat = getLineFormat(countStringLength);
for (int i = 0; i < data.length; i++) {
loadLine(sb,
lineFormat,
countStringLength,
i,
maximumCount);
}
return sb.toString();
}
/**
* Builds the format for printing the lines in the output.
*
* @param countStringLength the count string length in characters.
*
* @return the format for printing the lines in the output.
*/
private static String getLineFormat(final int countStringLength) {
return String.format("0x%%02x [%%c]: %% %dd %%s\n", countStringLength);
}
/**
* Loads a single line to the total output of this byte histogram.
*
* @param sb the string builder.
* @param lineFormat the format of the line.
* @param countStringLength the length of the count string.
* @param index the byte index.
* @param maximumCount the maximum count in the histogram.
*/
private void loadLine(final StringBuilder sb,
final String lineFormat,
final int countStringLength,
final int index,
final long maximumCount) {
sb.append(
String.format(
lineFormat,
index,
!Character.isISOControl((char) index) ?
(char) index :
'?',
data[index],
computeBarAscii(data[index],
maximumCount,
countStringLength)));
}
/**
* Computes and returns the bar ASCII art.
*
* @param count the count of the line we are processing.
* @param maximumCount the maximum count in the byte histogram.
* @param countStringLength the count string length.
*
* @return the bar ASCII art.
*/
private static String computeBarAscii(final long count,
final long maximumCount,
final int countStringLength) {
final float ratio = ((float) count) / ((float) maximumCount);
final int maximumBarLength = SCREEN_WIDTH
- LINE_PREAMBLE_WIDTH
- countStringLength;
final int barLength = (int)(ratio * maximumBarLength);
final StringBuilder sb = new StringBuilder(barLength);
for (int i = 0; i < barLength; i++) {
sb.append("*");
}
return sb.toString();
}
/**
* Computes the maximum count in this byte histogram.
*
* @return the maximum count.
*/
private long computeMaximumCount() {
long m = 0L;
for (final long count : data) {
m = Math.max(m, count);
}
return m;
}
/**
* Computes and returns the length of the widest length string.
*
* @param maximumLength the maximum length of the byte histogram.
*
* @return the widest length of the count string in characters.
*/
private static int computeCountStringLength(final long maximumLength) {
return Long.toString(maximumLength).length();
}
}
Typical output
C:\Users\rodio\OneDrive\Documents\NetBeansProjects\ByteHistogram.java\target\classes>echo Hello World | java com.github.coderodde.file.util.ByteHistogramApp
0x00 [?]: 0
0x01 [?]: 0
0x02 [?]: 0
0x03 [?]: 0
0x04 [?]: 0
0x05 [?]: 0
0x06 [?]: 0
0x07 [?]: 0
0x08 [?]: 0
0x09 [?]: 0
0x0a [?]: 1 **********************
0x0b [?]: 0
0x0c [?]: 0
0x0d [?]: 1 **********************
0x0e [?]: 0
0x0f [?]: 0
0x10 [?]: 0
0x11 [?]: 0
0x12 [?]: 0
0x13 [?]: 0
0x14 [?]: 0
0x15 [?]: 0
0x16 [?]: 0
0x17 [?]: 0
0x18 [?]: 0
0x19 [?]: 0
0x1a [?]: 0
0x1b [?]: 0
0x1c [?]: 0
0x1d [?]: 0
0x1e [?]: 0
0x1f [?]: 0
0x20 [ ]: 2 *********************************************
0x21 [!]: 0
0x22 ["]: 0
0x23 [#]: 0
0x24 [$]: 0
0x25 [%]: 0
0x26 [&]: 0
0x27 [']: 0
0x28 [(]: 0
0x29 [)]: 0
0x2a [*]: 0
0x2b [+]: 0
0x2c [,]: 0
0x2d [-]: 0
0x2e [.]: 0
0x2f [/]: 0
0x30 [0]: 0
0x31 [1]: 0
0x32 [2]: 0
0x33 [3]: 0
0x34 [4]: 0
0x35 [5]: 0
0x36 [6]: 0
0x37 [7]: 0
0x38 [8]: 0
0x39 [9]: 0
0x3a [:]: 0
0x3b [;]: 0
0x3c [<]: 0
0x3d [=]: 0
0x3e [>]: 0
0x3f [?]: 0
0x40 [@]: 0
0x41 [A]: 0
0x42 [B]: 0
0x43 [C]: 0
0x44 [D]: 0
0x45 [E]: 0
0x46 [F]: 0
0x47 [G]: 0
0x48 [H]: 1 **********************
0x49 [I]: 0
0x4a [J]: 0
0x4b [K]: 0
0x4c [L]: 0
0x4d [M]: 0
0x4e [N]: 0
0x4f [O]: 0
0x50 [P]: 0
0x51 [Q]: 0
0x52 [R]: 0
0x53 [S]: 0
0x54 [T]: 0
0x55 [U]: 0
0x56 [V]: 0
0x57 [W]: 1 **********************
0x58 [X]: 0
0x59 [Y]: 0
0x5a [Z]: 0
0x5b [[]: 0
0x5c [\]: 0
0x5d []]: 0
0x5e [^]: 0
0x5f [_]: 0
0x60 [`]: 0
0x61 [a]: 0
0x62 [b]: 0
0x63 [c]: 0
0x64 [d]: 1 **********************
0x65 [e]: 1 **********************
0x66 [f]: 0
0x67 [g]: 0
0x68 [h]: 0
0x69 [i]: 0
0x6a [j]: 0
0x6b [k]: 0
0x6c [l]: 3 ********************************************************************
0x6d [m]: 0
0x6e [n]: 0
0x6f [o]: 2 *********************************************
0x70 [p]: 0
0x71 [q]: 0
0x72 [r]: 1 **********************
0x73 [s]: 0
0x74 [t]: 0
0x75 [u]: 0
0x76 [v]: 0
0x77 [w]: 0
0x78 [x]: 0
0x79 [y]: 0
0x7a [z]: 0
0x7b [{]: 0
0x7c [|]: 0
0x7d [}]: 0
0x7e [~]: 0
0x7f [?]: 0
0x80 [?]: 0
0x81 [?]: 0
0x82 [?]: 0
0x83 [?]: 0
0x84 [?]: 0
0x85 [?]: 0
0x86 [?]: 0
0x87 [?]: 0
0x88 [?]: 0
0x89 [?]: 0
0x8a [?]: 0
0x8b [?]: 0
0x8c [?]: 0
0x8d [?]: 0
0x8e [?]: 0
0x8f [?]: 0
0x90 [?]: 0
0x91 [?]: 0
0x92 [?]: 0
0x93 [?]: 0
0x94 [?]: 0
0x95 [?]: 0
0x96 [?]: 0
0x97 [?]: 0
0x98 [?]: 0
0x99 [?]: 0
0x9a [?]: 0
0x9b [?]: 0
0x9c [?]: 0
0x9d [?]: 0
0x9e [?]: 0
0x9f [?]: 0
0xa0 [ ]: 0
0xa1 [¡]: 0
0xa2 [¢]: 0
0xa3 [£]: 0
0xa4 [¤]: 0
0xa5 [\]: 0
0xa6 [¦]: 0
0xa7 [§]: 0
0xa8 [ ̈]: 0
0xa9 [©]: 0
0xaa [a]: 0
0xab [«]: 0
0xac [¬]: 0
0xad []: 0
0xae [®]: 0
0xaf [ ̄]: 0
0xb0 [°]: 0
0xb1 [±]: 0
0xb2 [2]: 0
0xb3 [3]: 0
0xb4 [ ́]: 0
0xb5 [μ]: 0
0xb6 [¶]: 0
0xb7 [·]: 0
0xb8 [ ̧]: 0
0xb9 [1]: 0
0xba [o]: 0
0xbb [»]: 0
0xbc [1⁄4]: 0
0xbd [1⁄2]: 0
0xbe [3⁄4]: 0
0xbf [¿]: 0
0xc0 [À]: 0
0xc1 [Á]: 0
0xc2 [Â]: 0
0xc3 [Ã]: 0
0xc4 [Ä]: 0
0xc5 [Å]: 0
0xc6 [Æ]: 0
0xc7 [Ç]: 0
0xc8 [È]: 0
0xc9 [É]: 0
0xca [Ê]: 0
0xcb [Ë]: 0
0xcc [Ì]: 0
0xcd [Í]: 0
0xce [Î]: 0
0xcf [Ï]: 0
0xd0 [Ð]: 0
0xd1 [Ñ]: 0
0xd2 [Ò]: 0
0xd3 [Ó]: 0
0xd4 [Ô]: 0
0xd5 [Õ]: 0
0xd6 [Ö]: 0
0xd7 ×ばつ]: 0
0xd8 [Ø]: 0
0xd9 [Ù]: 0
0xda [Ú]: 0
0xdb [Û]: 0
0xdc [Ü]: 0
0xdd [Ý]: 0
0xde [Þ]: 0
0xdf [ß]: 0
0xe0 [à]: 0
0xe1 [á]: 0
0xe2 [â]: 0
0xe3 [ã]: 0
0xe4 [ä]: 0
0xe5 [å]: 0
0xe6 [æ]: 0
0xe7 [ç]: 0
0xe8 [è]: 0
0xe9 [é]: 0
0xea [ê]: 0
0xeb [ë]: 0
0xec [ì]: 0
0xed [í]: 0
0xee [î]: 0
0xef [ï]: 0
0xf0 [ð]: 0
0xf1 [ñ]: 0
0xf2 [ò]: 0
0xf3 [ó]: 0
0xf4 [ô]: 0
0xf5 [õ]: 0
0xf6 [ö]: 0
0xf7 [÷]: 0
0xf8 [ø]: 0
0xf9 [ù]: 0
0xfa [ú]: 0
0xfb [û]: 0
0xfc [ü]: 0
0xfd [ý]: 0
0xfe [þ]: 0
0xff [ÿ]: 0
Critique request
Please tell me anything that comes to mind.
-
\$\begingroup\$ I think those MultipleExceptions smell like an attempt to use exceptions for controlling normal program flow. Break up the app to smaller modules and store the failure to the result you get from one file. Then combine the results when all files have been processed. Thinking how you would solve this if you had to split the processing to a concurrent executor might help here. \$\endgroup\$TorbenPutkonen– TorbenPutkonen2024年11月15日 20:49:27 +00:00Commented Nov 15, 2024 at 20:49
1 Answer 1
Probably, I should have provided some code snippets in the previous answer.
Closing resources
The following approach doesn't guarantee that InputStream
will be closed:
for (final InputStream is : inputStreamList) {
try {
processInputStream(new BufferedInputStream(is), histogram);
} catch (final IOException e) {
ex.add(e);
}
}
private static void processInputStream(final InputStream is,
final ByteHistogram histogram) throws IOException {
int i;
while ((i = is.read()) != -1) {
histogram.insert(i);
}
is.close();
}
If read()
causes an IOException
, close()
method will not be executed.
To ensure that resources are closed at any circumstances, close()
should be invoked from a finally
block. Or you can use try-with-resources (see example below) as I suggested in the previous answer.
And it's always better to release resources as soon as possible. Deal with streams one by one (open -> read -> close) instead of opening them in butches, and then keeping them idle waiting for one-another to be processed.
Histogram.insert()
My proposition was to introduce a new method insertUnsigned(int)
that matches your current use case and communicates its intent (which is better than plain overloading), and that will coexist with insert(byte)
(I wasn't suggesting modifying the existing method).
Replacing insert(byte)
with insert(int)
changes the semantics of the method (the first was expecting a regular byte value, the second expects an unsigned byte), but the name doesn't reveal this intent.
This creates a possibility to invoke insert(int)
with a byte
argument, and neither the compiler (it will perform widening conversion and find the suitable method), nor you (the name communicate the method intent well enough) will smell that something fishy is happening. And if the given byte
is negative, you'll get an ArrayIndexOutOfBoundsException
.
Refactoring Example
Here's how you can use try-with-resources:
private static void countByteFrequenciesInFile(String fileName,
ByteHistogram histogram) throws IOException {
try (var stream = new BufferedInputStream(new FileInputStream(fileName))) {
int nextByte;
while ((nextByte = stream.read()) != -1) {
histogram.insertUnsigned(nextByte);
}
}
}
public static ByteHistogram filesToHistogram(String[] fileNames) {
var histogram = new ByteHistogram();
var multiIO = new MultipleIOException();
for (String name: fileNames) {
try {
countByteFrequenciesInFile(name, histogram);
} catch (IOException e) {
multiIO.add(e);
}
}
return histogram;
}
To check if all the files exist you can leverage Files.exists()
:
public static List<Path> findMissingFiles(String[] fileNames) {
return Arrays.stream(fileNames)
.map(Path::of)
.filter(not(Files::exists)) // import static java.util.function.Predicate.not;
.toList();
}