From b4fdca34b848c5bf891ca4caae76297878d95a76 Mon Sep 17 00:00:00 2001 From: KevM Date: Fri, 31 Mar 2017 10:37:25 -0500 Subject: [PATCH 1/4] Fix uri extraction - Restored usage of the Tika Content Handler factory. This may get rid of many content issues people have been reporting. - Using WebClient to download the URI contents as the Tika uri stream getter does not do anything. - Made StreamContentHandler obsolete, as it is no longer used but is public and I didn't want to make a breaking change. Closes #84 --- .../stream_text_extraction.cs | 26 ++++++++++++---- src/TikaOnDotNet.Tests/text_extraction.cs | 13 ++++++++ .../Stream/DotNetOutputStreamAdapter.cs | 30 +++++++++++++++++++ .../Stream/StreamTextExtractor.cs | 21 +++++++++++-- .../TextExtractor.cs | 12 ++++---- .../TikaOnDotnet.TextExtraction.csproj | 1 + 6 files changed, 89 insertions(+), 14 deletions(-) create mode 100644 src/TikaOnDotnet.TextExtractor/Stream/DotNetOutputStreamAdapter.cs diff --git a/src/TikaOnDotNet.Tests/stream_text_extraction.cs b/src/TikaOnDotNet.Tests/stream_text_extraction.cs index 3f20455..65fef83 100644 --- a/src/TikaOnDotNet.Tests/stream_text_extraction.cs +++ b/src/TikaOnDotNet.Tests/stream_text_extraction.cs @@ -20,15 +20,29 @@ public virtual void SetUp() } [Test] - public void should_throw_when_given_closed_stream() + public void should_throw_when_given_a_disposed_stream() { - var closedStream = new MemoryStream(); - closedStream.Dispose(); - + var disposedStream = new MemoryStream(); + disposedStream.Dispose(); var bytes = new byte[] {0, 1, 2, 3}; - Action act = () => _cut.Extract(metadata=> TikaInputStream.get(bytes, metadata), closedStream); - act.ShouldThrow().WithInnerException(); + Action act = () => _cut.Extract(metadata=> TikaInputStream.get(bytes, metadata), disposedStream); + + act.ShouldThrow(); + } + + [Test] + public void should_throw_when_given_a_closed_stream() + { + var file = Path.GetTempFileName(); + const int bufferSize = 4096; + var bytes = new byte[] { 0, 1, 2, 3 }; 
+ var closedStream = File.Create(file, bufferSize, FileOptions.DeleteOnClose); + closedStream.Close(); + + Action act = () => _cut.Extract(metadata => TikaInputStream.get(bytes, metadata), closedStream); + + act.ShouldThrow(); } } } \ No newline at end of file diff --git a/src/TikaOnDotNet.Tests/text_extraction.cs b/src/TikaOnDotNet.Tests/text_extraction.cs index f603093..67299aa 100644 --- a/src/TikaOnDotNet.Tests/text_extraction.cs +++ b/src/TikaOnDotNet.Tests/text_extraction.cs @@ -2,8 +2,10 @@ using System.IO; using System.Collections.Generic; using System.Linq; +using System.Net; using FluentAssertions; using NUnit.Framework; +using org.apache.tika.io; using TikaOnDotNet.TextExtraction; namespace TikaOnDotNet.Tests @@ -200,5 +202,16 @@ public void should_extract_msg() textExtractionResult.Text.Should().Contain("This is my test file"); textExtractionResult.Metadata["subject"].Should().Be("This is the subject"); } + + [Test] + public void should_extract_uri_contents() + { + const string url = "https://en.wikipedia.org/wiki/Apache_Tika"; + + var textExtractionResult = _cut.Extract(new Uri(url)); + + textExtractionResult.Text.Should().Contain("Apache Tika is a content detection and analysis framework"); + textExtractionResult.Metadata["Uri"].Should().Be(url); + } } } \ No newline at end of file diff --git a/src/TikaOnDotnet.TextExtractor/Stream/DotNetOutputStreamAdapter.cs b/src/TikaOnDotnet.TextExtractor/Stream/DotNetOutputStreamAdapter.cs new file mode 100644 index 0000000..d19b16b --- /dev/null +++ b/src/TikaOnDotnet.TextExtractor/Stream/DotNetOutputStreamAdapter.cs @@ -0,0 +1,30 @@ + +using java.io; + +namespace TikaOnDotNet.TextExtraction.Stream +{ + internal class DotNetOutputStreamAdapter : OutputStream + { + private readonly System.IO.Stream _stream; + + public DotNetOutputStreamAdapter(System.IO.Stream stream) + { + _stream = stream; + } + + public override void write(int b) + { + _stream.WriteByte((byte) b); + } + + public override void write(byte[] b, 
int off, int len) + { + _stream.Write(b, off, len); + } + + public override void write(byte[] b) + { + _stream.Write(b, 0, b.Length); + } + } +} diff --git a/src/TikaOnDotnet.TextExtractor/Stream/StreamTextExtractor.cs b/src/TikaOnDotnet.TextExtractor/Stream/StreamTextExtractor.cs index faa4efe..5efd7c9 100644 --- a/src/TikaOnDotnet.TextExtractor/Stream/StreamTextExtractor.cs +++ b/src/TikaOnDotnet.TextExtractor/Stream/StreamTextExtractor.cs @@ -1,6 +1,9 @@ using System; using System.Linq; using java.io; +using javax.xml.transform; +using javax.xml.transform.sax; +using javax.xml.transform.stream; using org.apache.tika.metadata; using org.apache.tika.parser; using Parser = org.apache.tika.parser.Parser; @@ -16,17 +19,16 @@ public ExtractionResult Extract(Func streamFactory, Syste var parser = new AutoDetectParser(); var metadata = new Metadata(); var parseContext = new ParseContext(); + var handler = GetTransformerHandler(outputStream); //use the base class type for the key or parts of Tika won't find a usable parser parseContext.set(typeof(Parser), parser); - var contentHandlerResult = new StreamContentHandler(outputStream); - using (var inputStream = streamFactory(metadata)) { try { - parser.parse(inputStream, contentHandlerResult, metadata, parseContext); + parser.parse(inputStream, handler, metadata, parseContext); } finally { @@ -42,6 +44,19 @@ public ExtractionResult Extract(Func streamFactory, Syste } } + private static TransformerHandler GetTransformerHandler(System.IO.Stream outputStream) + { + var factory = (SAXTransformerFactory)TransformerFactory.newInstance(); + var transformerHandler = factory.newTransformerHandler(); + + transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); + transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + + var outputAdapter = new DotNetOutputStreamAdapter(outputStream); + transformerHandler.setResult(new StreamResult(outputAdapter)); + return transformerHandler; + } + 
private static ExtractionResult AssembleExtractionResult(Metadata metadata) { var metaDataResult = metadata.names() diff --git a/src/TikaOnDotnet.TextExtractor/TextExtractor.cs b/src/TikaOnDotnet.TextExtractor/TextExtractor.cs index 54d1fcc..b2ae99f 100644 --- a/src/TikaOnDotnet.TextExtractor/TextExtractor.cs +++ b/src/TikaOnDotnet.TextExtractor/TextExtractor.cs @@ -1,5 +1,6 @@ using System; using System.IO; +using System.Net; using java.io; using org.apache.tika.io; using org.apache.tika.metadata; @@ -35,13 +36,13 @@ public TextExtractionResult Extract(byte[] data) public TextExtractionResult Extract(Uri uri) { - var jUri = new java.net.URI(uri.ToString()); return Extract(metadata => { - var result = TikaInputStream.get(jUri, metadata); - metadata.add("Uri", uri.ToString()); - return result; - }); + metadata.add("Uri", uri.ToString()); + var pageBytes = new WebClient().DownloadData(uri); + + return TikaInputStream.get(pageBytes, metadata); + }); } public TextExtractionResult Extract(Func streamFactory) @@ -50,6 +51,7 @@ public TextExtractionResult Extract(Func streamFactory) using (var outputStream = new MemoryStream()) { var streamResult = streamExtractor.Extract(streamFactory, outputStream); + outputStream.Position = 0; using (var reader = new StreamReader(outputStream)) { diff --git a/src/TikaOnDotnet.TextExtractor/TikaOnDotnet.TextExtraction.csproj b/src/TikaOnDotnet.TextExtractor/TikaOnDotnet.TextExtraction.csproj index 6279e88..027cf6c 100644 --- a/src/TikaOnDotnet.TextExtractor/TikaOnDotnet.TextExtraction.csproj +++ b/src/TikaOnDotnet.TextExtractor/TikaOnDotnet.TextExtraction.csproj @@ -46,6 +46,7 @@ Properties\SolutionInfo.cs + From 1f13703171c75ce39a81ad4801b7ff46ab7ae31b Mon Sep 17 00:00:00 2001 From: KevM Date: Fri, 31 Mar 2017 11:47:28 -0500 Subject: [PATCH 2/4] Mark stream content handler as obsolete --- src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs b/src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs index 199428d..6663abf 100644 --- a/src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs +++ b/src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs @@ -1,3 +1,4 @@ +using System; using System.IO; using org.xml.sax; @@ -7,6 +8,7 @@ namespace TikaOnDotNet.TextExtraction.Stream /// Write Tika output to a string builder while squelching the dreadful empty lines. /// NOTE: This type is only public for testing. /// + [Obsolete("This handler was introducing bugs and we restored the built in Tika handler. Please do not use unless you know what you are doing.", true)] public class StreamContentHandler : ContentHandler { private readonly StreamWriter _writer; From 0ff642a9d98d1901a7b58e53c07b947a5c7ea9b2 Mon Sep 17 00:00:00 2001 From: KevM Date: Fri, 31 Mar 2017 11:58:17 -0500 Subject: [PATCH 3/4] Removed obsolete test file --- .../stream_content_handler.cs | 65 ------------------- 1 file changed, 65 deletions(-) delete mode 100644 src/TikaOnDotNet.Tests/stream_content_handler.cs diff --git a/src/TikaOnDotNet.Tests/stream_content_handler.cs b/src/TikaOnDotNet.Tests/stream_content_handler.cs deleted file mode 100644 index b5497b6..0000000 --- a/src/TikaOnDotNet.Tests/stream_content_handler.cs +++ /dev/null @@ -1,65 +0,0 @@ -using System.IO; -using System.Linq; -using FluentAssertions; -using NUnit.Framework; -using org.xml.sax.ext; -using TikaOnDotNet.TextExtraction.Stream; - -namespace TikaOnDotNet.Tests -{ - [TestFixture] - public class stream_content_handler - { - private StreamContentHandler _cut; - private char[] _expectedCharacters; - private MemoryStream _outputStream; - - [SetUp] - public void SetUp() - { - _expectedCharacters = new[] { 'a', 'b', 'c' }; - _outputStream = new MemoryStream(); - _cut = new StreamContentHandler(_outputStream); - } - - private string GetResult() - { - using (var reader = new 
StreamReader(_outputStream)) - { - return reader.ReadToEnd(); - } - } - - [Test] - public void should_capture_character_content() - { - _cut.characters(_expectedCharacters, 0, _expectedCharacters.Length); - _cut.endDocument(); - - var result = GetResult(); - result.Length.Should().Be(_expectedCharacters.Length + 2); - result.Take(_expectedCharacters.Length).Should().BeEquivalentTo(_expectedCharacters); - } - - [Test] - public void should_ignore_empty_content() - { - _cut.characters(new char[0], 0, 0); - _cut.endDocument(); - - GetResult().Length.Should().Be(0); - } - - [Test] - public void should_ignore_non_character_content() - { - _cut.startDocument(); - _cut.startElement("wee", "local", "name", new Attributes2Impl()); - _cut.characters(_expectedCharacters, 0, _expectedCharacters.Length); - _cut.endElement("wee", "local", "name"); - _cut.endDocument(); - - GetResult().Length.Should().Be(_expectedCharacters.Length + 2); - } - } -} \ No newline at end of file From 44f97e924ac29d47a8676ef659847d582dc72312 Mon Sep 17 00:00:00 2001 From: KevM Date: Fri, 31 Mar 2017 12:09:20 -0500 Subject: [PATCH 4/4] Don't you hate VS when you delete a file and the .csproj doesn't update? --- src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj b/src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj index 4bb6489..31166d8 100644 --- a/src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj +++ b/src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj @@ -43,7 +43,6 @@ -