Skip to content

Commit

Permalink
Merge pull request #93 from KevM/fix-84
Browse files Browse the repository at this point in the history
Fix uri extraction
  • Loading branch information
KevM authored Apr 22, 2017
2 parents 378e096 + 44f97e9 commit 78bcaed
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 80 deletions.
1 change: 0 additions & 1 deletion src/TikaOnDotNet.Tests/TikaOnDotNet.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
</ItemGroup>
<ItemGroup>
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="stream_content_handler.cs" />
<Compile Include="stream_text_extraction.cs" />
<Compile Include="text_extraction.cs" />
</ItemGroup>
Expand Down
65 changes: 0 additions & 65 deletions src/TikaOnDotNet.Tests/stream_content_handler.cs

This file was deleted.

26 changes: 20 additions & 6 deletions src/TikaOnDotNet.Tests/stream_text_extraction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,29 @@ public virtual void SetUp()
}

[Test]
public void should_throw_when_given_closed_stream()
public void should_throw_when_given_a_disposed_stream()
{
var closedStream = new MemoryStream();
closedStream.Dispose();

var disposedStream = new MemoryStream();
disposedStream.Dispose();
var bytes = new byte[] {0, 1, 2, 3};
Action act = () => _cut.Extract(metadata=> TikaInputStream.get(bytes, metadata), closedStream);

act.ShouldThrow<TextExtractionException>().WithInnerException<ArgumentException>();
Action act = () => _cut.Extract(metadata=> TikaInputStream.get(bytes, metadata), disposedStream);

act.ShouldThrow<TextExtractionException>();
}

[Test]
public void should_throw_when_given_a_closed_stream()
{
    // Arrange: a file-backed output stream that has already been closed,
    // plus a small in-memory payload for Tika to read from.
    const int bufferSize = 4096;
    var payload = new byte[] { 0, 1, 2, 3 };
    var tempPath = Path.GetTempFileName();
    var closedStream = File.Create(tempPath, bufferSize, FileOptions.DeleteOnClose);
    closedStream.Close();

    // Act: defer the extraction so the assertion can observe the exception.
    Action extractIntoClosedStream = () =>
        _cut.Extract(metadata => TikaInputStream.get(payload, metadata), closedStream);

    // Assert: the failure is expected to surface as a TextExtractionException.
    extractIntoClosedStream.ShouldThrow<TextExtractionException>();
}
}
}
13 changes: 13 additions & 0 deletions src/TikaOnDotNet.Tests/text_extraction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using FluentAssertions;
using NUnit.Framework;
using org.apache.tika.io;
using TikaOnDotNet.TextExtraction;

namespace TikaOnDotNet.Tests
Expand Down Expand Up @@ -200,5 +202,16 @@ public void should_extract_msg()
textExtractionResult.Text.Should().Contain("This is my test file");
textExtractionResult.Metadata["subject"].Should().Be("This is the subject");
}

[Test]
public void should_extract_uri_contents()
{
    // The page is addressed by a live https URL, so this test presumably
    // needs network access to pass.
    const string wikipediaPage = "https://en.wikipedia.org/wiki/Apache_Tika";
    var pageUri = new Uri(wikipediaPage);

    var extraction = _cut.Extract(pageUri);

    // The extracted text should contain the article's opening sentence, and
    // the "Uri" metadata entry should equal the address that was requested.
    extraction.Text.Should().Contain("Apache Tika is a content detection and analysis framework");
    extraction.Metadata["Uri"].Should().Be(wikipediaPage);
}
}
}
30 changes: 30 additions & 0 deletions src/TikaOnDotnet.TextExtractor/Stream/DotNetOutputStreamAdapter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

using java.io;

namespace TikaOnDotNet.TextExtraction.Stream
{
/// <summary>
/// Adapts a .NET <see cref="System.IO.Stream"/> so it can be handed to Java
/// APIs that expect a <c>java.io.OutputStream</c>. Every write and flush is
/// forwarded to the wrapped stream.
/// </summary>
internal class DotNetOutputStreamAdapter : OutputStream
{
    private readonly System.IO.Stream _stream;

    /// <summary>
    /// Wraps <paramref name="stream"/>. The adapter does not override
    /// <c>close()</c>, so the wrapped stream is never closed through it.
    /// </summary>
    /// <param name="stream">The .NET stream that receives the written bytes.</param>
    public DotNetOutputStreamAdapter(System.IO.Stream stream)
    {
        _stream = stream;
    }

    /// <summary>
    /// Writes a single byte. Per the <c>OutputStream.write(int)</c> contract
    /// only the low eight bits of <paramref name="b"/> are written.
    /// </summary>
    public override void write(int b)
    {
        // unchecked keeps the truncation well-defined even if the assembly
        // is compiled with overflow checking enabled.
        _stream.WriteByte(unchecked((byte) b));
    }

    /// <summary>
    /// Writes <paramref name="len"/> bytes from <paramref name="b"/>,
    /// starting at offset <paramref name="off"/>.
    /// </summary>
    public override void write(byte[] b, int off, int len)
    {
        _stream.Write(b, off, len);
    }

    /// <summary>
    /// Writes the whole of <paramref name="b"/>.
    /// </summary>
    public override void write(byte[] b)
    {
        _stream.Write(b, 0, b.Length);
    }

    /// <summary>
    /// Forwards a Java-side flush to the wrapped stream so that buffered
    /// .NET streams actually push their bytes when the Java writer asks.
    /// </summary>
    public override void flush()
    {
        _stream.Flush();
    }
}
}
2 changes: 2 additions & 0 deletions src/TikaOnDotnet.TextExtractor/Stream/StreamContentHandler.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System;
using System.IO;
using org.xml.sax;

Expand All @@ -7,6 +8,7 @@ namespace TikaOnDotNet.TextExtraction.Stream
/// Write Tika output to a string builder while squelching the dreadful empty lines.
/// NOTE: This type is only public for testing.
/// </summary>
[Obsolete("This handler was introducing bugs and we restored the built in Tika handerl. Please do not use unless you know what you are doing.", true)]
public class StreamContentHandler : ContentHandler
{
private readonly StreamWriter _writer;
Expand Down
21 changes: 18 additions & 3 deletions src/TikaOnDotnet.TextExtractor/Stream/StreamTextExtractor.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
using System;
using System.Linq;
using java.io;
using javax.xml.transform;
using javax.xml.transform.sax;
using javax.xml.transform.stream;
using org.apache.tika.metadata;
using org.apache.tika.parser;
using Parser = org.apache.tika.parser.Parser;
Expand All @@ -16,17 +19,16 @@ public ExtractionResult Extract(Func<Metadata, InputStream> streamFactory, Syste
var parser = new AutoDetectParser();
var metadata = new Metadata();
var parseContext = new ParseContext();
var handler = GetTransformerHandler(outputStream);

//use the base class type for the key or parts of Tika won't find a usable parser
parseContext.set(typeof(Parser), parser);

var contentHandlerResult = new StreamContentHandler(outputStream);

using (var inputStream = streamFactory(metadata))
{
try
{
parser.parse(inputStream, contentHandlerResult, metadata, parseContext);
parser.parse(inputStream, handler, metadata, parseContext);
}
finally
{
Expand All @@ -42,6 +44,19 @@ public ExtractionResult Extract(Func<Metadata, InputStream> streamFactory, Syste
}
}

/// <summary>
/// Builds a SAX <see cref="TransformerHandler"/> configured for plain-text
/// output without indentation, whose result is written to
/// <paramref name="outputStream"/> through a <see cref="DotNetOutputStreamAdapter"/>.
/// </summary>
/// <param name="outputStream">The .NET stream that receives the transformer output.</param>
/// <returns>The configured handler, ready to be passed to a parser.</returns>
private static TransformerHandler GetTransformerHandler(System.IO.Stream outputStream)
{
    var saxFactory = (SAXTransformerFactory)TransformerFactory.newInstance();
    var handler = saxFactory.newTransformerHandler();

    // Configure the underlying transformer once: text method, no indenting.
    var transformer = handler.getTransformer();
    transformer.setOutputProperty(OutputKeys.METHOD, "text");
    transformer.setOutputProperty(OutputKeys.INDENT, "no");

    // Route the transformer's output into the caller's .NET stream.
    handler.setResult(new StreamResult(new DotNetOutputStreamAdapter(outputStream)));
    return handler;
}

private static ExtractionResult AssembleExtractionResult(Metadata metadata)
{
var metaDataResult = metadata.names()
Expand Down
12 changes: 7 additions & 5 deletions src/TikaOnDotnet.TextExtractor/TextExtractor.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.IO;
using System.Net;
using java.io;
using org.apache.tika.io;
using org.apache.tika.metadata;
Expand Down Expand Up @@ -35,13 +36,13 @@ public TextExtractionResult Extract(byte[] data)

public TextExtractionResult Extract(Uri uri)
{
var jUri = new java.net.URI(uri.ToString());
return Extract(metadata =>
{
var result = TikaInputStream.get(jUri, metadata);
metadata.add("Uri", uri.ToString());
return result;
});
metadata.add("Uri", uri.ToString());
var pageBytes = new WebClient().DownloadData(uri);

return TikaInputStream.get(pageBytes, metadata);
});
}

public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
Expand All @@ -50,6 +51,7 @@ public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
using (var outputStream = new MemoryStream())
{
var streamResult = streamExtractor.Extract(streamFactory, outputStream);
outputStream.Position = 0;

using (var reader = new StreamReader(outputStream))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
<Compile Include="..\..\SolutionInfo.cs">
<Link>Properties\SolutionInfo.cs</Link>
</Compile>
<Compile Include="Stream\DotNetOutputStreamAdapter.cs" />
<Compile Include="Stream\ExtractionResult.cs" />
<Compile Include="Stream\StreamContentHandler.cs" />
<Compile Include="Stream\StreamTextExtractor.cs" />
Expand Down

0 comments on commit 78bcaed

Please sign in to comment.