1 /*
 2  * Autopsy Forensic Browser
 3  *
 4  * Copyright 2011-2017 Basis Technology Corp.
 5  * Contact: carrier <at> sleuthkit <dot> org
 6  *
 7  * Licensed under the Apache License, Version 2.0 (the "License");
 8  * you may not use this file except in compliance with the License.
 9  * You may obtain a copy of the License at
 10  *
 11  * http://www.apache.org/licenses/LICENSE-2.0
 12  *
 13  * Unless required by applicable law or agreed to in writing, software
 14  * distributed under the License is distributed on an "AS IS" BASIS,
 15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 package org.sleuthkit.autopsy.keywordsearch;
 20 
 21 import java.io.IOException;
 22 import java.io.PushbackReader;
 23 import java.io.Reader;
 24 import java.nio.charset.Charset;
 25 import java.nio.charset.StandardCharsets;
 26 import java.util.Iterator;
 27 import java.util.NoSuchElementException;
 28 import javax.annotation.concurrent.NotThreadSafe;
 31 
 39 @NotThreadSafe
 40 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
 41 
 42  //local references to standard encodings
 43  private static final Charset UTF_16 = StandardCharsets.UTF_16;
 44  private static final Charset UTF_8 = StandardCharsets.UTF_8;
 45 
 46  //Chunking algorithm paramaters-------------------------------------//
 48  private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
 51  private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
 54  private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
 57  private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
 59  private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
 60 
 62 
 64  private final PushbackReader reader;
 66  private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
 67 
 69  private int chunkSizeBytes = 0;
 72  private boolean endOfReaderReached = false;
 74  private Exception ex;
 75 
 81  Chunker(Reader reader) {
 82  //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
 83  this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
 84  }
 85 
 86  @Override
 87  public Iterator<Chunk> iterator() {
 88  return this;
 89  }
 90 
 97  boolean hasException() {
 98  return ex != null;
 99  }
 100 
 106  public Exception getException() {
 107  return ex;
 108  }
 109 
 110  @Override
 111  public boolean hasNext() {
 112  return (ex == null)
 113  && (endOfReaderReached == false);
 114  }
 115 
 125  private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
 126  final int length = sb.length();
 127  for (int i = 0; i < length; i++) {
 128  if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
 129  sb.replace(i, i + 1, "^");
 130  }
 131  }
 132  return sb;
 133  }
 134 
 144  private static StringBuilder replaceInvalidUTF16(String s) {
 145  /* encode the string to UTF-16 which does the replcement, see
 146  * Charset.encode(), then decode back to a StringBuilder. */
 147  return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
 148  }
 149 
 150  private static StringBuilder sanitize(String s) {
 151  return sanitizeToUTF8(replaceInvalidUTF16(s));
 152  }
 153 
 154  @Override
 155  public Chunk next() {
 156  if (hasNext() == false) {
 157  throw new NoSuchElementException("There are no more chunks.");
 158  }
 159  //reset state for the next chunk
 160 
 161  chunkSizeBytes = 0;
 162  int baseChunkSizeChars = 0;
 163  StringBuilder currentChunk = new StringBuilder();
 164  StringBuilder currentWindow = new StringBuilder();
 165 
 166  try {
 167  currentChunk.append(readBaseChunk());
 168  baseChunkSizeChars = currentChunk.length(); //save the base chunk length
 169  currentWindow.append(readWindow());
 170  //add the window text to the current chunk.
 171  currentChunk.append(currentWindow);
 172  if (endOfReaderReached) {
 173  /* if we have reached the end of the content,we won't make
 174  * another overlapping chunk, so the length of the base chunk
 175  * can be extended to the end. */
 176  baseChunkSizeChars = currentChunk.length();
 177  } else {
 178  /* otherwise we will make another chunk, so unread the window */
 179  reader.unread(currentWindow.toString().toCharArray());
 180  }
 181  } catch (Exception ioEx) {
 182  /* Save the exception, which will cause hasNext() to return false,
 183  * and break any chunking loop in client code. */
 184  ex = ioEx;
 185  }
 186  
 187  //sanitize the text and return a Chunk object, that includes the base chunk length.
 188  return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
 189  }
 190 
 196  private StringBuilder readBaseChunk() throws IOException {
 197  StringBuilder currentChunk = new StringBuilder();
 198  //read the chunk until the minimum base chunk size
 199  readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
 200 
 201  //keep reading until the maximum base chunk size or white space is reached.
 202  readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
 203  return currentChunk;
 204  }
 205 
 211  private StringBuilder readWindow() throws IOException {
 212  StringBuilder currentWindow = new StringBuilder();
 213  //read the window, leaving some room to look for white space to break at.
 214  readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
 215 
 216  //keep reading until the max chunk size, or until whitespace is reached.
 217  readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
 218  return currentWindow;
 219  }
 220 
 229  private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
 230  int charsRead = 0;
 231  //read chars up to maxBytes, or the end of the reader.
 232  while ((chunkSizeBytes < maxBytes)
 233  && (endOfReaderReached == false)) {
 234  charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
 235  if (-1 == charsRead) {
 236  //this is the last chunk
 237  endOfReaderReached = true;
 238  return;
 239  } else {
 240  //if the last char might be part of a surroate pair, unread it.
 241  final char lastChar = tempChunkBuf[charsRead - 1];
 242  if (Character.isHighSurrogate(lastChar)) {
 243  charsRead--;
 244  reader.unread(lastChar);
 245  }
 246 
 247  //cleanup any invalid utf-16 sequences
 248  StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
 249 
 250  //get the length in utf8 bytes of the read chars
 251  int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
 252 
 253  //if it will not put us past maxBytes
 254  if (chunkSizeBytes + segmentSize < maxBytes) {
 255  //add it to the chunk
 256  currentSegment.append(chunkSegment);
 257  chunkSizeBytes += segmentSize;
 258  } else {
 259  //unread it, and break out of read loop.
 260  reader.unread(tempChunkBuf, 0, charsRead);
 261  return;
 262  }
 263  }
 264  }
 265  }
 266 
 275  private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
 276  int charsRead = 0;
 277  boolean whitespaceFound = false;
 278  //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
 279  while ((chunkSizeBytes < maxBytes)
 280  && (whitespaceFound == false)
 281  && (endOfReaderReached == false)) {
 282  charsRead = reader.read(tempChunkBuf, 0, 1);
 283  if (-1 == charsRead) {
 284  //this is the last chunk
 285  endOfReaderReached = true;
 286  return;
 287  } else {
 288  //if the last charcter might be part of a surroate pair, read another char
 289  final char ch = tempChunkBuf[0];
 290  String chunkSegment;
 291  if (Character.isHighSurrogate(ch)) {
 292  //read another char into the buffer.
 293  charsRead = reader.read(tempChunkBuf, 1, 1);
 294  if (charsRead == -1) {
 295  //this is the last chunk, so just drop the unpaired surrogate
 296  endOfReaderReached = true;
 297  return;
 298  } else {
 299  //try to use the pair together.
 300  chunkSegment = new String(tempChunkBuf, 0, 2);
 301  }
 302  } else {
 303  //one char
 304  chunkSegment = new String(tempChunkBuf, 0, 1);
 305  }
 306 
 307  //cleanup any invalid utf-16 sequences
 308  StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
 309  //check for whitespace.
 310  whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
 311  //add read chars to the chunk and update the length.
 312  currentChunk.append(sanitizedChunkSegment);
 313  chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
 314  }
 315  }
 316  }
 317 
 322  static class Chunk {
 323 
 324  private final StringBuilder sb;
 325  private final int baseChunkSizeChars;
 326  private final int chunkSizeBytes;
 327 
 328  Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
 329  this.sb = sb;
 330  this.baseChunkSizeChars = baseChunkSizeChars;
 331  this.chunkSizeBytes = chunkSizeBytes;
 332  }
 333 
 339  @Override
 340  public String toString() {
 341  return sb.toString();
 342  }
 343 
 349  public int getChunkSizeBytes() {
 350  return chunkSizeBytes;
 351  }
 352 
 358  int getBaseChunkLength() {
 359  return baseChunkSizeChars;
 360  }
 361  }
 362 }