Autopsy: /home/carriersleuth/repos/autopsy/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/Chunker.java Source File

Autopsy 4.13.0

Graphical digital forensics platform for The Sleuth Kit and other tools.

File List

Chunker.java

Go to the documentation of this file.

1 /*

2 * Autopsy Forensic Browser

3 *

5 * Contact: carrier <at> sleuthkit <dot> org

6 *

7 * Licensed under the Apache License, Version 2.0 (the "License");

8 * you may not use this file except in compliance with the License.

9 * You may obtain a copy of the License at

10 *

11 * http://www.apache.org/licenses/LICENSE-2.0

12 *

13 * Unless required by applicable law or agreed to in writing, software

14 * distributed under the License is distributed on an "AS IS" BASIS,

15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

16 * See the License for the specific language governing permissions and

17 * limitations under the License.

18 */

19 package org.sleuthkit.autopsy.keywordsearch;

21 import java.io.IOException;

22 import java.io.PushbackReader;

23 import java.io.Reader;

24 import java.nio.charset.Charset;

25 import java.nio.charset.StandardCharsets;

26 import java.text.Normalizer;

27 import java.util.Iterator;

28 import java.util.NoSuchElementException;

29 import javax.annotation.concurrent.NotThreadSafe;

30 import org.sleuthkit.autopsy.coreutils.TextUtil;

31 import org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk;

40 @NotThreadSafe

41 class Chunker implements Iterator<Chunk>, Iterable<Chunk> {

43 //local references to standard encodings

44 private static final Charset UTF_16 = StandardCharsets.UTF_16;

45 private static final Charset UTF_8 = StandardCharsets.UTF_8;

47 //Chunking algorithm paramaters-------------------------------------//

51 private static final int MAX_TOTAL_CHUNK_SIZE = 32760; //bytes

56 private static final int MINIMUM_BASE_CHUNK_SIZE = 30 * 1024; //bytes

61 private static final int MAXIMUM_BASE_CHUNK_SIZE = 31 * 1024; //bytes

66 private static final int WHITE_SPACE_BUFFER_SIZE = 512; //bytes

70 private static final int READ_CHARS_BUFFER_SIZE = 512; //chars

79 private static final int MAX_CHAR_SIZE_INCREASE_IN_BYTES = 10; //bytes

86 private final PushbackReader reader;

90 private final char[] tempChunkBuf = new char[READ_CHARS_BUFFER_SIZE];

95 private int chunkSizeBytes = 0;

102 private int lowerCasedChunkSizeBytes = 0;

107 private boolean endOfReaderReached = false;

111 private Exception ex;

112

118 Chunker(Reader reader) {

119 //Using MAX_TOTAL_CHUNK_SIZE is safe but probably overkill.

120 this.reader = new PushbackReader(reader, MAX_TOTAL_CHUNK_SIZE);

121 }

122

123 @Override

124 public Iterator<Chunk> iterator() {

125 return this;

126 }

127

134 boolean hasException() {

135 return ex != null;

136 }

137

143 public Exception getException() {

144 return ex;

145 }

146

147 @Override

148 public boolean hasNext() {

149 return (ex == null)

150 && (endOfReaderReached == false);

151 }

152

162 private static StringBuilder sanitizeToUTF8(StringBuilder sb) {

163 final int length = sb.length();

164 for (int i = 0; i < length; i++) {

165 if (TextUtil.isValidSolrUTF8(sb.charAt(i)) == false) {

166 sb.replace(i, i + 1, "^");

167 }

168 }

169 return sb;

170 }

171

181 private static StringBuilder replaceInvalidUTF16(String s) {

182 /* encode the string to UTF-16 which does the replcement, see

183 * Charset.encode(), then decode back to a StringBuilder. */

184 return new StringBuilder(UTF_16.decode(UTF_16.encode(s)));

185 }

186

187 private static StringBuilder sanitize(String s) {

188 String normStr = Normalizer.normalize(s, Normalizer.Form.NFKC);

189 return sanitizeToUTF8(replaceInvalidUTF16(normStr));

190

191 }

192

193 @Override

194 public Chunk next() {

195 if (hasNext() == false) {

196 throw new NoSuchElementException("There are no more chunks.");

197 }

198 //reset state for the next chunk

199

200 chunkSizeBytes = 0;

201 lowerCasedChunkSizeBytes = 0;

202 int baseChunkSizeChars = 0;

203 StringBuilder currentChunk = new StringBuilder();

204 StringBuilder currentWindow = new StringBuilder();

205 StringBuilder lowerCasedChunk = new StringBuilder();

206

207 try {

208 readBaseChunk(currentChunk, lowerCasedChunk);

209 baseChunkSizeChars = currentChunk.length(); //save the base chunk length

210 readWindow(currentWindow, lowerCasedChunk);

211 //add the window text to the current chunk.

212 currentChunk.append(currentWindow);

213 if (endOfReaderReached) {

214 /* if we have reached the end of the content,we won't make

215 * another overlapping chunk, so the length of the base chunk

216 * can be extended to the end. */

217 baseChunkSizeChars = currentChunk.length();

218 } else {

219 /* otherwise we will make another chunk, so unread the window */

220 reader.unread(currentWindow.toString().toCharArray());

221 }

222 } catch (Exception ioEx) {

223 /* Save the exception, which will cause hasNext() to return false,

224 * and break any chunking loop in client code. */

225 ex = ioEx;

226 }

227

228 //sanitize the text and return a Chunk object, that includes the base chunk length.

229 return new Chunk(currentChunk, baseChunkSizeChars, lowerCasedChunk);

230 }

231

237 private void readBaseChunk(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {

238 //read the chunk until the minimum base chunk size

239 readHelper(MINIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);

240

241 //keep reading until the maximum base chunk size or white space is reached.

242 readToWhiteSpaceHelper(MAXIMUM_BASE_CHUNK_SIZE, currentChunk, lowerCasedChunk);

243 }

244

250 private void readWindow(StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {

251 //read the window, leaving some room to look for white space to break at.

252 readHelper(MAX_TOTAL_CHUNK_SIZE - WHITE_SPACE_BUFFER_SIZE, currentChunk, lowerCasedChunk);

253

254 //keep reading until the max chunk size, or until whitespace is reached.

255 readToWhiteSpaceHelper(MAX_TOTAL_CHUNK_SIZE, currentChunk, lowerCasedChunk);

256 }

257

266 private void readHelper(int maxBytes, StringBuilder currentSegment, StringBuilder currentLowerCasedSegment) throws IOException {

267 int charsRead = 0;

268 //read chars up to maxBytes, or the end of the reader.

269 while ((chunkSizeBytes < maxBytes) && (lowerCasedChunkSizeBytes < maxBytes)

270 && (endOfReaderReached == false)) {

271 charsRead = reader.read(tempChunkBuf, 0, READ_CHARS_BUFFER_SIZE);

272 if (-1 == charsRead) {

273 //this is the last chunk

274 endOfReaderReached = true;

275 return;

276 } else {

277 //if the last char might be part of a surroate pair, unread it.

278 final char lastChar = tempChunkBuf[charsRead - 1];

279 if (Character.isHighSurrogate(lastChar)) {

280 charsRead--;

281 reader.unread(lastChar);

282 }

283

284 //cleanup any invalid utf-16 sequences

285 StringBuilder chunkSegment = sanitize(new String(tempChunkBuf, 0, charsRead));

286

287 //get the length in utf8 bytes of the read chars

288 int segmentSize = chunkSegment.toString().getBytes(UTF_8).length;

289

290 // lower case the string and get it's size. NOTE: lower casing can

291 // change the size of the string!

292 String lowerCasedSegment = chunkSegment.toString().toLowerCase();

293 int lowerCasedSegmentSize = lowerCasedSegment.getBytes(UTF_8).length;

294

295 //if it will not put us past maxBytes

296 if ((chunkSizeBytes + segmentSize < maxBytes) && (lowerCasedChunkSizeBytes + lowerCasedSegmentSize < maxBytes)) {

297 //add it to the chunk

298 currentSegment.append(chunkSegment);

299 chunkSizeBytes += segmentSize;

300

301 currentLowerCasedSegment.append(lowerCasedSegment);

302 lowerCasedChunkSizeBytes += lowerCasedSegmentSize;

303 } else {

304 //unread it, and break out of read loop.

305 reader.unread(tempChunkBuf, 0, charsRead);

306 return;

307 }

308 }

309 }

310 }

311

320 private void readToWhiteSpaceHelper(int maxBytes, StringBuilder currentChunk, StringBuilder lowerCasedChunk) throws IOException {

321 int charsRead = 0;

322 boolean whitespaceFound = false;

323 //read 1 char at a time up to maxBytes, whitespaceFound, or we reach the end of the reader.

324 while ((chunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)

325 && (lowerCasedChunkSizeBytes < maxBytes - MAX_CHAR_SIZE_INCREASE_IN_BYTES)

326 && (whitespaceFound == false)

327 && (endOfReaderReached == false)) {

328 charsRead = reader.read(tempChunkBuf, 0, 1);

329 if (-1 == charsRead) {

330 //this is the last chunk

331 endOfReaderReached = true;

332 return;

333 } else {

334 //if the last charcter might be part of a surroate pair, read another char

335 final char ch = tempChunkBuf[0];

336 String chunkSegment;

337 if (Character.isHighSurrogate(ch)) {

338 //read another char into the buffer.

339 charsRead = reader.read(tempChunkBuf, 1, 1);

340 if (charsRead == -1) {

341 //this is the last chunk, so just drop the unpaired surrogate

342 endOfReaderReached = true;

343 return;

344 } else {

345 //try to use the pair together.

346 chunkSegment = new String(tempChunkBuf, 0, 2);

347 }

348 } else {

349 //one char

350 chunkSegment = new String(tempChunkBuf, 0, 1);

351 }

352

353 //cleanup any invalid utf-16 sequences

354 StringBuilder sanitizedChunkSegment = sanitize(chunkSegment);

355 //check for whitespace.

356 whitespaceFound = Character.isWhitespace(sanitizedChunkSegment.codePointAt(0));

357 //add read chars to the chunk and update the length.

358 currentChunk.append(sanitizedChunkSegment);

359 chunkSizeBytes += sanitizedChunkSegment.toString().getBytes(UTF_8).length;

360

361 // lower case the string and get it's size. NOTE: lower casing can

362 // change the size of the string.

363 String lowerCasedSegment = sanitizedChunkSegment.toString().toLowerCase();

364 lowerCasedChunk.append(lowerCasedSegment);

365 lowerCasedChunkSizeBytes += lowerCasedSegment.getBytes(UTF_8).length;

366 }

367 }

368 }

369

374 static class Chunk {

375

376 private final StringBuilder sb;

377 private final int baseChunkSizeChars;

378 private final StringBuilder lowerCasedChunk;

379

380 Chunk(StringBuilder sb, int baseChunkSizeChars, StringBuilder lowerCasedChunk) {

381 this.sb = sb;

382 this.baseChunkSizeChars = baseChunkSizeChars;

383 this.lowerCasedChunk = lowerCasedChunk;

384 }

385

391 @Override

392 public String toString() {

393 return sb.toString();

394 }

395

401 public String geLowerCasedChunk() {

402 return lowerCasedChunk.toString();

403 }

404

410 int getBaseChunkLength() {

411 return baseChunkSizeChars;

412 }

413 }

414 }

org.sleuthkit

org

org.sleuthkit.autopsy.coreutils

Definition: AppSQLiteDB.java:19

org.sleuthkit.autopsy.keywordsearch.Chunker

Definition: Chunker.java:41

org.sleuthkit.autopsy.keywordsearch.Chunker.Chunk

Definition: Chunker.java:374

org.sleuthkit.autopsy.coreutils.TextUtil

Definition: TextUtil.java:26

org.sleuthkit.autopsy

org.sleuthkit.autopsy.keywordsearch

Definition: AccountsText.java:19