@Override public void write(final Document document, final Reader reader) throws IOException { if (outputMetadata) { writeMetadata(document); } // A PrintStream should never throw an IOException: the exception would always come from the input stream. // There's no need to use a TaggedOutputStream or catch IOExceptions. copy(reader, stream); // Add an extra newline to signify the end of the text. stream.println(); if (stream.checkError()) { throw new TaggedIOException(new IOException("Error writing to print stream."), this); } // Write out child documents, if any. for (EmbeddedDocument embed: document.getEmbeds()) { try (final Reader embedReader = embed.getReader()) { write(embed, embedReader); } } }
/**
 * Writes the metadata of a document as pretty-printed, UTF-8 encoded JSON.
 *
 * The JSON file is placed at the document's output path with {@code .json} appended. Single-valued fields are
 * written as string fields and multi-valued fields as arrays of strings.
 *
 * @param document the document whose metadata is being written
 * @throws IOException a {@link TaggedIOException} tagged with this spewer if the JSON can't be written; the
 * underlying exception is preserved as the cause of the wrapped {@link IOException}
 */
@Override
public void writeMetadata(final Document document) throws IOException {
    final Metadata metadata = document.getMetadata();

    Path outputPath = getOutputPath(document);
    outputPath = outputPath.getFileSystem().getPath(outputPath.toString() + ".json");

    logger.info(String.format("Outputting metadata to file: \"%s\".", outputPath));

    try (final JsonGenerator jsonGenerator = new JsonFactory().createGenerator(outputPath.toFile(),
            JsonEncoding.UTF8)) {
        jsonGenerator.useDefaultPrettyPrinter();
        jsonGenerator.writeStartObject();

        new MetadataTransformer(metadata, fields).transform(jsonGenerator::writeStringField, (name, values) -> {

            // writeArrayFieldStart emits the field name AND the opening bracket, so no separate
            // writeStartArray call is needed; doing both would open a second, nested array.
            jsonGenerator.writeArrayFieldStart(name);

            for (String value : values) {
                jsonGenerator.writeString(value);
            }

            // Close the array so that the next field is written into the enclosing object.
            jsonGenerator.writeEndArray();
        });

        jsonGenerator.writeEndObject();
        jsonGenerator.writeRaw('\n');
    } catch (IOException e) {

        // Keep the original exception as the cause so that the reason for the failure isn't lost.
        throw new TaggedIOException(new IOException("Unable to output JSON.", e), this);
    }
}
/** * Extract and spew content from a document. This method is the same as {@link #extract(Document, Spewer)} with * the exception that the document will be skipped if the reporter returns {@literal false} for a call to * {@link Reporter#skip(Document)}. * * If the document is not skipped, then the result of the extraction is passed to the reporter in a call to * {@link Reporter#save(Document, ExtractionStatus, Exception)}. * * @param document document to extract from * @param spewer endpoint to write to * @param reporter used to check whether the document should be skipped and save extraction status */ public void extract(final Document document, final Spewer spewer, final Reporter reporter) { Objects.requireNonNull(reporter); if (reporter.skip(document)) { logger.info(String.format("File already extracted; skipping: \"%s\".", document)); return; } ExtractionStatus status = ExtractionStatus.SUCCESS; Exception exception = null; try { extract(document, spewer); } catch (final Exception e) { status = status(e, spewer); log(e, status, document); exception = e; } // For tagged IO exceptions, discard the tag, which is either unwanted or not serializable. if (null != exception && (exception instanceof TaggedIOException)) { exception = ((TaggedIOException) exception).getCause(); } reporter.save(document, status, exception); }
/** * Convert the given {@link Exception} into an {@link ExtractionStatus} for addition to a report. * * Logs an appropriate message depending on the exception. * * @param e the exception to convert and log * @return the resulting status */ private ExtractionStatus status(final Exception e, final Spewer spewer) { if (TaggedIOException.isTaggedWith(e, spewer)) { return ExtractionStatus.FAILURE_NOT_SAVED; } if (TaggedIOException.isTaggedWith(e, MetadataTransformer.class)) { return ExtractionStatus.FAILURE_NOT_PARSED; } if (e instanceof FileNotFoundException) { return ExtractionStatus.FAILURE_NOT_FOUND; } if (!(e instanceof IOException)) { return ExtractionStatus.FAILURE_UNKNOWN; } final Throwable cause = e.getCause(); if (cause instanceof EncryptedDocumentException) { return ExtractionStatus.FAILURE_NOT_DECRYPTED; } // TIKA-198: IOExceptions thrown by parsers will be wrapped in a TikaException. // This helps us differentiate input stream exceptions from output stream exceptions. // https://issues.apache.org/jira/browse/TIKA-198 if (cause instanceof TikaException) { return ExtractionStatus.FAILURE_NOT_PARSED; } return ExtractionStatus.FAILURE_UNREADABLE; }
@Override public void write(final Document document, final Reader reader) throws IOException { final Path outputPath = getOutputPath(document); // Add the output extension. Path contentsOutputPath; if (null != outputExtension) { contentsOutputPath = outputPath.getFileSystem().getPath(outputPath.toString() + "." + outputExtension); } else { contentsOutputPath = outputPath; } logger.info(String.format("Outputting to file: \"%s\".", contentsOutputPath)); // Make the required directories. final Path outputParent = contentsOutputPath.getParent(); if (null != outputParent) { final File outputFileParent = outputParent.toFile(); final boolean madeDirs = outputFileParent.mkdirs(); // The {@link File#mkdirs} method will return false if the path already exists. if (!madeDirs && !outputFileParent.isDirectory()) { throw new TaggedIOException(new IOException(String.format("Unable to make directories for file: \"%s\".", contentsOutputPath)), this); } } TaggedOutputStream tagged = null; // #copy buffers the input so there's no need to use an output buffer. try (final OutputStream output = Files.newOutputStream(contentsOutputPath)) { tagged = new TaggedOutputStream(output); copy(reader, tagged); } catch (IOException e) { if (null != tagged && tagged.isCauseOf(e)) { throw new TaggedIOException(new IOException(String.format("Error writing output to file: \"%s\".", contentsOutputPath), e), this); } else { throw e; } } if (outputMetadata) { writeMetadata(document); } }
/**
 * Checks whether the given exception carries the tag of this stream.
 *
 * @param exception the exception to inspect
 * @return {@code true} if the exception is tagged with this stream's tag,
 *         {@code false} otherwise
 */
public boolean isCauseOf(Exception exception) {
    final boolean taggedByThisStream = TaggedIOException.isTaggedWith(exception, tag);
    return taggedByThisStream;
}
/**
 * Unwraps and re-throws the exception originally raised by this stream.
 * <p>
 * The check and the re-throw are delegated to
 * {@code TaggedIOException.throwCauseIfTaggedWith}, which is passed the given
 * exception and this stream's tag. If the exception was not raised by this
 * stream, the method returns normally.
 *
 * @param exception the exception to inspect
 * @throws IOException the original exception, if it was thrown by this stream
 */
public void throwIfCauseOf(Exception exception) throws IOException {
    TaggedIOException.throwCauseIfTaggedWith(
            exception,
            tag);
}
/**
 * Wraps the given exception in a {@link TaggedIOException} carrying this
 * stream's tag and throws the wrapper.
 *
 * @param e the IOException that was thrown
 * @throws IOException always; the tagged wrapper around {@code e}
 */
@Override
protected void handleIOException(IOException e) throws IOException {
    final TaggedIOException taggedWrapper = new TaggedIOException(e, tag);
    throw taggedWrapper;
}
/**
 * Checks whether the given throwable carries the tag of this stream.
 *
 * @param exception the throwable to inspect
 * @return {@code true} if the throwable is tagged with this stream's tag,
 *         {@code false} otherwise
 */
public boolean isCauseOf(Throwable exception) {
    final boolean taggedByThisStream = TaggedIOException.isTaggedWith(exception, tag);
    return taggedByThisStream;
}
/**
 * Unwraps and re-throws the exception originally raised by this stream.
 * <p>
 * The check and the re-throw are delegated to
 * {@code TaggedIOException.throwCauseIfTaggedWith}, which is passed the given
 * throwable and this stream's tag. If the throwable was not raised by this
 * stream, the method returns normally.
 *
 * @param throwable the throwable to inspect
 * @throws IOException the original exception, if it was thrown by this stream
 */
public void throwIfCauseOf(Throwable throwable) throws IOException {
    TaggedIOException.throwCauseIfTaggedWith(
            throwable,
            tag);
}
/**
 * Reports whether this stream is the origin of the given exception, judged
 * by whether the exception is tagged with this stream's tag.
 *
 * @param exception the exception to examine
 * @return {@code true} when the exception is tagged with this stream's tag;
 *         {@code false} when it is not
 */
public boolean isCauseOf(Exception exception) {
    final boolean originatedHere = TaggedIOException.isTaggedWith(exception, tag);
    return originatedHere;
}
/**
 * Reports whether this stream is the origin of the given throwable, judged
 * by whether the throwable is tagged with this stream's tag.
 *
 * @param exception the throwable to examine
 * @return {@code true} when the throwable is tagged with this stream's tag;
 *         {@code false} when it is not
 */
public boolean isCauseOf(Throwable exception) {
    final boolean originatedHere = TaggedIOException.isTaggedWith(exception, tag);
    return originatedHere;
}
/**
 * Determines whether the given exception was tagged by this stream.
 *
 * @param exception the exception to test
 * @return {@code true} if the exception bears this stream's tag,
 *         {@code false} if it does not
 */
public boolean isCauseOf(final Exception exception) {
    final boolean bearsOurTag = TaggedIOException.isTaggedWith(exception, tag);
    return bearsOurTag;
}
/**
 * Re-throws the underlying exception if the given exception was raised by
 * this stream, and otherwise does nothing.
 * <p>
 * The test and the unwrapping are performed by
 * {@code TaggedIOException.throwCauseIfTaggedWith} using this stream's tag.
 *
 * @param exception the exception to test
 * @throws IOException the original exception, if it was thrown by this stream
 */
public void throwIfCauseOf(final Exception exception) throws IOException {
    TaggedIOException.throwCauseIfTaggedWith(
            exception,
            tag);
}
/**
 * Tags the given exception with this stream's tag by wrapping it in a
 * {@link TaggedIOException}, then throws the wrapper.
 *
 * @param e the IOException that was thrown
 * @throws IOException always; the tagged wrapper around {@code e}
 */
@Override
protected void handleIOException(final IOException e) throws IOException {
    final TaggedIOException taggedWrapper = new TaggedIOException(e, tag);
    throw taggedWrapper;
}
/**
 * Determines whether the given throwable was tagged by this stream.
 *
 * @param exception the throwable to test
 * @return {@code true} if the throwable bears this stream's tag,
 *         {@code false} if it does not
 */
public boolean isCauseOf(final Throwable exception) {
    final boolean bearsOurTag = TaggedIOException.isTaggedWith(exception, tag);
    return bearsOurTag;
}
/**
 * Re-throws the underlying exception if the given throwable was raised by
 * this stream, and otherwise does nothing.
 * <p>
 * The test and the unwrapping are performed by
 * {@code TaggedIOException.throwCauseIfTaggedWith} using this stream's tag.
 *
 * @param throwable the throwable to test
 * @throws IOException the original exception, if it was thrown by this stream
 */
public void throwIfCauseOf(final Throwable throwable) throws IOException {
    TaggedIOException.throwCauseIfTaggedWith(
            throwable,
            tag);
}