| 16 | |
| 17 | /** |
| 18 | * Returns a plaintext representation of the file. |
| 19 | * @param source the file to look into |
| 20 | * @param mimetype the mimetype that gnowsis has identified for this file |
| 21 | * @return the extracted plaintext, or null if the file contains no plaintext. If the text could not be |
| 22 | * extracted, an exception is thrown instead of returning null. |
| 23 | * @throws ExtractionException when something goes wrong with extraction |
| 24 | * @throws FileNotFoundException when the file does not exist |
| 25 | */ |
| 26 | public String getPlaintext(File source, String mimetype) throws FileNotFoundException, ExtractionException ; |
| 27 | |
| 28 | |
| 29 | |
| 30 | |
| 31 | |
| 32 | /** |
| 33 | * Creates a Lucene document for the passed file. |
| 34 | * To see what fields would be needed, look at the top of this class. The declaration lists IOException and DocumentExtractorException; the conditions that trigger them are not specified here. |
| 35 | * @param file the passed file, i.e. the one identified by {@code uri} |
| 36 | * @param uri the URI identifying the passed file. You may need it when you add sophisticated RDF information. |
| 37 | * @param mimetype the mimetype of the passed file/stream. If your extractor can handle multiple mimetypes, this can be handy. |
| 38 | * @param options optional options that may help you. |
| 39 | * @return a Lucene document |
| 40 | */ |
| 41 | public Document createLuceneDocument(File file, String uri, String mimetype, Object options) throws IOException, DocumentExtractorException; |
| 42 | /** Overload of createLuceneDocument that takes an InputStream in place of the File; the uri, mimetype and options parameters, the return type and the declared exceptions match the File-based variant. */ |
| 43 | public Document createLuceneDocument(InputStream stream, String uri, String mimetype, Object options) throws IOException, DocumentExtractorException; |
| 44 | |
| 45 | |