1 /*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2017 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.sleuthkit.autopsy.keywordsearch;
20
21 import java.io.IOException;
22 import java.io.PushbackReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
26 import java.util.Iterator;
27 import java.util.NoSuchElementException;
28 import javax.annotation.concurrent.NotThreadSafe;
31
39 @NotThreadSafe
40 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {
41
42 //local references to standard encodings
43 private static final Charset UTF_16 = StandardCharsets.UTF_16;
44 private static final Charset UTF_8 = StandardCharsets.UTF_8;
45
46 //Chunking algorithm paramaters-------------------------------------//
48 private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes
51 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes
54 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes
57 private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes
59 private static final int READ_CHARS_BUFFER_SIZE = 512; //chars
60
62
64 private final PushbackReader reader;
66 private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];
67
69 private int chunkSizeBytes = 0;
72 private boolean endOfReaderReached = false;
74 private Exception ex;
75
81 Chunker(Reader reader) {
82 //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.
83 this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);
84 }
85
86 @Override
87 public Iterator<Chunk> iterator() {
88 return this;
89 }
90
97 boolean hasException() {
98 return ex != null;
99 }
100
106 public Exception getException() {
107 return ex;
108 }
109
110 @Override
111 public boolean hasNext() {
112 return (ex == null)
113 && (endOfReaderReached == false);
114 }
115
125 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {
126 final int length = sb.length();
127 for (int i = 0; i < length; i++) {
128 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {
129 sb.replace(i, i + 1, "^");
130 }
131 }
132 return sb;
133 }
134
144 private static StringBuilder replaceInvalidUTF16(String s) {
145 /* encode the string to UTF-16 which does the replcement, see
146 * Charset.encode(), then decode back to a StringBuilder. */
147 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));
148 }
149
150 private static StringBuilder sanitize(String s) {
151 return sanitizeToUTF8(replaceInvalidUTF16(s));
152 }
153
154 @Override
155 public Chunk next() {
156 if (hasNext() == false) {
157 throw new NoSuchElementException("There are no more chunks.");
158 }
159 //reset state for the next chunk
160
161 chunkSizeBytes = 0;
162 int baseChunkSizeChars = 0;
163 StringBuilder currentChunk = new StringBuilder();
164 StringBuilder currentWindow = new StringBuilder();
165
166 try {
167 currentChunk.append(readBaseChunk());
168 baseChunkSizeChars = currentChunk.length(); //save the base chunk length
169 currentWindow.append(readWindow());
170 //add the window text to the current chunk.
171 currentChunk.append(currentWindow);
172 if (endOfReaderReached) {
173 /* if we have reached the end of the content,we won't make
174 * another overlapping chunk, so the length of the base chunk
175 * can be extended to the end. */
176 baseChunkSizeChars = currentChunk.length();
177 } else {
178 /* otherwise we will make another chunk, so unread the window */
179 reader.unread(currentWindow.toString().toCharArray());
180 }
181 } catch (Exception ioEx) {
182 /* Save the exception, which will cause hasNext() to return false,
183 * and break any chunking loop in client code. */
184 ex = ioEx;
185 }
186
187 //sanitize the text and return a Chunk object, that includes the base chunk length.
188 return new Chunk(currentChunk, baseChunkSizeChars, chunkSizeBytes);
189 }
190
196 private StringBuilder readBaseChunk() throws IOException {
197 StringBuilder currentChunk = new StringBuilder();
198 //read the chunk until the minimum base chunk size
199 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk);
200
201 //keep reading until the maximum base chunk size or white space is reached.
202 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk);
203 return currentChunk;
204 }
205
211 private StringBuilder readWindow() throws IOException {
212 StringBuilder currentWindow = new StringBuilder();
213 //read the window, leaving some room to look for white space to break at.
214 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentWindow);
215
216 //keep reading until the max chunk size, or until whitespace is reached.
217 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentWindow);
218 return currentWindow;
219 }
220
229 private void readHelper(int maxBytes, StringBuilder currentSegment) throws IOException {
230 int charsRead = 0;
231 //read chars up to maxBytes, or the end of the reader.
232 while ((chunkSizeBytes < maxBytes)
233 && (endOfReaderReached == false)) {
234 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);
235 if (-1 == charsRead) {
236 //this is the last chunk
237 endOfReaderReached = true;
238 return;
239 } else {
240 //if the last char might be part of a surroate pair, unread it.
241 final char lastChar = tempChunkBuf[charsRead - 1];
242 if (Character.isHighSurrogate(lastChar)) {
243 charsRead--;
244 reader.unread(lastChar);
245 }
246
247 //cleanup any invalid utf-16 sequences
248 StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));
249
250 //get the length in utf8 bytes of the read chars
251 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;
252
253 //if it will not put us past maxBytes
254 if (chunkSizeBytes + segmentSize < maxBytes) {
255 //add it to the chunk
256 currentSegment.append(chunkSegment);
257 chunkSizeBytes += segmentSize;
258 } else {
259 //unread it, and break out of read loop.
260 reader.unread(tempChunkBuf, 0, charsRead);
261 return;
262 }
263 }
264 }
265 }
266
275 private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk) throws IOException {
276 int charsRead = 0;
277 boolean whitespaceFound = false;
278 //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.
279 while ((chunkSizeBytes < maxBytes)
280 && (whitespaceFound == false)
281 && (endOfReaderReached == false)) {
282 charsRead = reader.read(tempChunkBuf, 0, 1);
283 if (-1 == charsRead) {
284 //this is the last chunk
285 endOfReaderReached = true;
286 return;
287 } else {
288 //if the last charcter might be part of a surroate pair, read another char
289 final char ch = tempChunkBuf[0];
290 String chunkSegment;
291 if (Character.isHighSurrogate(ch)) {
292 //read another char into the buffer.
293 charsRead = reader.read(tempChunkBuf, 1, 1);
294 if (charsRead == -1) {
295 //this is the last chunk, so just drop the unpaired surrogate
296 endOfReaderReached = true;
297 return;
298 } else {
299 //try to use the pair together.
300 chunkSegment = new String(tempChunkBuf, 0, 2);
301 }
302 } else {
303 //one char
304 chunkSegment = new String(tempChunkBuf, 0, 1);
305 }
306
307 //cleanup any invalid utf-16 sequences
308 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);
309 //check for whitespace.
310 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));
311 //add read chars to the chunk and update the length.
312 currentChunk.append(sanitizedChunkSegment);
313 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;
314 }
315 }
316 }
317
322 static class Chunk {
323
324 private final StringBuilder sb;
325 private final int baseChunkSizeChars;
326 private final int chunkSizeBytes;
327
328 Chunk(StringBuilder sb, int baseChunkSizeChars, int chunkSizeBytes) {
329 this.sb = sb;
330 this.baseChunkSizeChars = baseChunkSizeChars;
331 this.chunkSizeBytes = chunkSizeBytes;
332 }
333
339 @Override
340 public String toString() {
341 return sb.toString();
342 }
343
349 public int getChunkSizeBytes() {
350 return chunkSizeBytes;
351 }
352
358 int getBaseChunkLength() {
359 return baseChunkSizeChars;
360 }
361 }
362 }