1 /*
2 * Autopsy Forensic Browser
3 *
4 * Copyright 2011-2016 Basis Technology Corp.
5 * Contact: carrier <at> sleuthkit <dot> org
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.sleuthkit.autopsy.keywordsearch;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.Reader;
25 import java.io.UnsupportedEncodingException;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.logging.Level;
29 import org.apache.solr.client.solrj.SolrServerException;
30 import org.apache.solr.common.util.ContentStream;
31 import org.apache.solr.common.SolrInputDocument;
32 import org.openide.util.NbBundle;
48
52 class Ingester {
53
    private static final Logger logger = Logger.getLogger(Ingester.class.getName());
    // True when documents have been added to Solr but commit() has not yet run;
    // checked in finalize() to warn about lost work.
    private volatile boolean uncommitedIngests = false;
    private final Server solrServer = KeywordSearch.getServer();
    // Visitor that maps content objects to their Solr field name/value pairs.
    private final GetContentFieldsV getContentFieldsV = new GetContentFieldsV();
    // Lazily created singleton instance; access via getDefault().
    private static Ingester instance;

    //for ingesting chunk as SolrInputDocument (non-content-streaming, by-pass tika)
    //TODO use a streaming way to add content to /update handler
    private static final int MAX_DOC_CHUNK_SIZE = 1024 * 1024;
    private static final String ENCODING = "UTF-8"; //NON-NLS

    // Private: this is a singleton, obtain it through getDefault().
    private Ingester() {
    }
67
68 public static synchronized Ingester getDefault() {
69 if (instance == null) {
70 instance = new Ingester();
71 }
72 return instance;
73 }
74
    /**
     * Finalizer safety net: warns if documents were added to the index but
     * never committed, since those additions may be lost.
     */
    @Override
    @SuppressWarnings("FinalizeDeclaration")
    protected void finalize() throws Throwable {
        super.finalize();

        // Warn if files might have been left uncommited.
        if (uncommitedIngests) {
            logger.warning("Ingester was used to add files that it never committed."); //NON-NLS
        }
    }
85
95 void ingest(AbstractFileStringContentStream afscs) throws IngesterException {
96 Map<String, String> params = getContentFields(afscs.getSourceContent());
97 ingest(afscs, params, afscs.getSourceContent().getSize());
98 }
99
112 void ingest(TextExtractor fe) throws IngesterException {
113 Map<String, String> params = getContentFields(fe.getSourceFile());
114
115 params.put(Server.Schema.NUM_CHUNKS.toString(), Integer.toString(fe.getNumChunks()));
116
117 ingest(new NullContentStream(fe.getSourceFile()), params, 0);
118 }
119
132 void ingest(AbstractFileChunk fec, ByteContentStream bcs, int size) throws IngesterException {
133 AbstractContent sourceContent = bcs.getSourceContent();
134 Map<String, String> params = getContentFields(sourceContent);
135
136 //overwrite id with the chunk id
137 params.put(Server.Schema.ID.toString(),
138 Server.getChunkIdString(sourceContent.getId(), fec.getChunkNumber()));
139
140 ingest(bcs, params, size);
141 }
142
156 void ingest(AbstractFile file, boolean ingestContent) throws IngesterException {
157 if (ingestContent == false || file.isDir()) {
158 ingest(new NullContentStream(file), getContentFields(file), 0);
159 } else {
160 ingest(new FscContentStream(file), getContentFields(file), file.getSize());
161 }
162 }
163
171 private Map<String, String> getContentFields(AbstractContent fsc) {
172 return fsc.accept(getContentFieldsV);
173 }
174
179
180 @Override
182 return new HashMap<>();
183 }
184
185 @Override
186 public Map<String, String>
visit(File f) {
189 return params;
190 }
191
192 @Override
193 public Map<String, String>
visit(DerivedFile df) {
196 return params;
197 }
198
199 @Override
200 public Map<String, String>
visit(Directory d) {
203 return params;
204 }
205
206 @Override
207 public Map<String, String>
visit(LayoutFile lf) {
208 // layout files do not have times
210 }
211
212 @Override
213 public Map<String, String>
visit(LocalFile lf) {
216 return params;
217 }
218
219 @Override
220 public Map<String, String>
visit(SlackFile f) {
223 return params;
224 }
225
231 return params;
232 }
233
235 Map<String, String> params = new HashMap<>();
236 params.put(
Server.
Schema.ID.toString(), Long.toString(af.getId()));
237 try {
238 long dataSourceId = af.getDataSource().getId();
239 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(dataSourceId));
240 } catch (TskCoreException ex) {
241 logger.log(Level.SEVERE, "Could not get data source id to properly index the file {0}", af.getId()); //NON-NLS
242 params.put(
Server.
Schema.IMAGE_ID.toString(), Long.toString(-1));
243 }
244
245 params.put(
Server.
Schema.FILE_NAME.toString(), af.getName());
246 return params;
247 }
248 }
249
266 void ingest(ContentStream cs, Map<String, String> fields, final long size) throws IngesterException {
267 if (fields.get(
Server.
Schema.IMAGE_ID.toString()) == null) {
268 //skip the file, image id unknown
269 String msg = NbBundle.getMessage(this.getClass(),
270 "Ingester.ingest.exception.unknownImgId.msg", cs.getName());
271 logger.log(Level.SEVERE, msg);
272 throw new IngesterException(msg);
273 }
274
275 final byte[] docChunkContentBuf = new byte[MAX_DOC_CHUNK_SIZE];
276 SolrInputDocument updateDoc = new SolrInputDocument();
277
278 for (String key : fields.keySet()) {
279 updateDoc.addField(key, fields.get(key));
280 }
281
282 //using size here, but we are no longer ingesting entire files
283 //size is normally a chunk size, up to 1MB
284 if (size > 0) {
285 // TODO (RC): Use try with resources, adjust exception messages
286 InputStream is = null;
287 int read = 0;
288 try {
289 is = cs.getStream();
290 read = is.read(docChunkContentBuf);
291 } catch (IOException ex) {
292 throw new IngesterException(
293 NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.cantReadStream.msg",
294 cs.getName()));
295 } finally {
296 if (null != is) {
297 try {
298 is.close();
299 } catch (IOException ex) {
300 logger.log(Level.WARNING, "Could not close input stream after reading content, " + cs.getName(), ex); //NON-NLS
301 }
302 }
303 }
304
305 if (read != 0) {
306 String s = "";
307 try {
308 s = new String(docChunkContentBuf, 0, read, ENCODING);
309 // Sanitize by replacing non-UTF-8 characters with caret '^' before adding to index
310 char[] chars = null;
311 for (int i = 0; i < s.length(); i++) {
312 if (!TextUtil.isValidSolrUTF8(s.charAt(i))) {
313 // only convert string to char[] if there is a non-UTF8 character
314 if (chars == null) {
315 chars = s.toCharArray();
316 }
317 chars[i] = '^';
318 }
319 }
320 // check if the string was modified (i.e. there was a non-UTF8 character found)
321 if (chars != null) {
322 s = new String(chars);
323 }
324 } catch (UnsupportedEncodingException ex) {
325 logger.log(Level.SEVERE, "Unsupported encoding", ex); //NON-NLS
326 }
327 updateDoc.addField(Server.Schema.CONTENT.toString(), s);
328 } else {
329 updateDoc.addField(Server.Schema.CONTENT.toString(), "");
330 }
331 } else {
332 //no content, such as case when 0th chunk indexed
333 updateDoc.addField(Server.Schema.CONTENT.toString(), "");
334 }
335
336 try {
337 //TODO consider timeout thread, or vary socket timeout based on size of indexed content
338 solrServer.addDocument(updateDoc);
339 uncommitedIngests = true;
340 } catch (KeywordSearchModuleException ex) {
341 throw new IngesterException(
342 NbBundle.getMessage(this.getClass(), "Ingester.ingest.exception.err.msg", cs.getName()), ex);
343 }
344
345 }
346
354 static int getTimeout(long size) {
355 if (size < 1024 * 1024L) //1MB
356 {
357 return 60;
358 } else if (size < 10 * 1024 * 1024L) //10MB
359 {
360 return 1200;
361 } else if (size < 100 * 1024 * 1024L) //100MB
362 {
363 return 3600;
364 } else {
365 return 3 * 3600;
366 }
367
368 }
369
374 void commit() {
375 try {
376 solrServer.commit();
377 uncommitedIngests = false;
378 } catch (NoOpenCoreException | SolrServerException ex) {
379 logger.log(Level.WARNING, "Error commiting index", ex); //NON-NLS
380 }
381 }
382
387
388 private AbstractFile
f;
389
392 }
393
394 @Override
396 return f.getName();
397 }
398
399 @Override
401 return NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getSrcInfo", f.getId());
402 }
403
404 @Override
406 return null;
407 }
408
409 @Override
411 return f.getSize();
412 }
413
414 @Override
416 return new ReadContentInputStream(f);
417 }
418
419 @Override
421 throw new UnsupportedOperationException(
422 NbBundle.getMessage(this.getClass(), "Ingester.FscContentStream.getReader"));
423 }
424 }
425
430
431 AbstractContent aContent;
432
434 this.aContent = aContent;
435 }
436
437 @Override
439 return aContent.getName();
440 }
441
442 @Override
444 return NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getSrcInfo.text", aContent.getId());
445 }
446
447 @Override
449 return null;
450 }
451
452 @Override
454 return 0L;
455 }
456
457 @Override
459 return new ByteArrayInputStream(new byte[0]);
460 }
461
462 @Override
464 throw new UnsupportedOperationException(
465 NbBundle.getMessage(this.getClass(), "Ingester.NullContentStream.getReader"));
466 }
467 }
468
473 static class IngesterException extends Exception {
474
475 private static final long serialVersionUID = 1L;
476
477 IngesterException(String message, Throwable ex) {
478 super(message, ex);
479 }
480
481 IngesterException(String message) {
482 super(message);
483 }
484 }
485 }
/*
 * NOTE(review): the signature index below is extraction residue (an outline of
 * the GetContentFieldsV visitor methods), not valid Java. Preserved inside a
 * comment so the file remains parseable:
 * Map< String, String > visit(Directory d)
 * Map< String, String > visit(SlackFile f)
 * Map< String, String > defaultVisit(Content cntnt)
 * Map< String, String > visit(DerivedFile df)
 * static String getStringTimeISO8601(long epochSeconds, TimeZone tzone)
 * Map< String, String > visit(File f)
 * Map< String, String > getCommonFields(AbstractFile af)
 * Map< String, String > getCommonFileContentFields(Map< String, String > params, AbstractFile file)
 * Map< String, String > visit(LocalFile lf)
 * Map< String, String > visit(LayoutFile lf)
 */