diff --git a/NugetPage.md b/NugetPage.md new file mode 100644 index 0000000..c7240ea --- /dev/null +++ b/NugetPage.md @@ -0,0 +1,66 @@ +# SimpleSpider + +A simple and modular web spider written in C# for .NET Core + + ![.NET Core](https://github.com/RafaelEstevamReis/SimpleSpider/workflows/.NET%20Core/badge.svg) + [![NuGet](https://buildstats.info/nuget/Net.RafaelEstevam.Spider.Simple.Lib)](http://nuget.org/packages/Net.RafaelEstevam.Spider.Simple.Lib) + +# Content +- [SimpleSpider](#simplespider) +- [Content](#content) +- [Some advantages](#some-advantages) +- [Getting started](#getting-started) +- [Samples](#samples) + +## Some advantages + +* Very simple to use and operate, ideal for lots of small projects or personal ones +* Easy HTML filter with [HObject](https://github.com/RafaelEstevamReis/SimpleSpider/blob/master/Simple.Test/Sample/QuotesToScrape_HObject.cs) (an HtmlNode wrapper used much like JObject) +* Internal conversion from HTML to XElement, no need for external tools +* Automatic Json parser to JObject +* Automatic Json deserialization +* Modular Parser engine (you can add your own parsers!) + * JSON and XML already included +* Modular Caching engine (you can add your own!) + * Standalone Cache engine included, no need for external software +* Modular Downloader engine (you can add your own!) + * WebClient with cookies or HttpClient download engine included + +Easy **import with [NuGet](https://www.nuget.org/packages/Net.RafaelEstevam.Spider.Simple.Lib)** + +## Getting started + +1. Start a new console project and add the NuGet reference +2. PM> Install-Package Net.RafaelEstevam.Spider.Simple.Lib +3. Create a class for your spider (or leave in program) +4. Create a new instance of SimpleSpider + 1. Give it a name; cache and log will be saved with that name + 2. Give it a domain (your spider will not stray from it) +5. Add a handler to the `FetchCompleted` event +6. Optionally give a first page with `AddPage`. If omitted, it will use the home page of the domain +7. 
Call `Execute()` + +``` C# +void run() +{ + var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/")); + // Set the completed event to implement your stuff + spider.FetchCompleted += fetchCompleted_items; + // execute + spider.Execute(); +} +void fetchCompleted_items(object Sender, FetchCompleteEventArgs args) +{ + // walk around ... + // TIP: inspect args to see stuff + + var hObj = args.GetHObject(); + string[] quotes = hObj["span > .text"]; +} +``` + +> TIP: Use the [Simple.Tests](https://github.com/RafaelEstevamReis/SimpleSpider/tree/master/Simple.Test/Sample) project to see examples and poke around + +## Samples + +See all samples at [Simple.Tests](https://github.com/RafaelEstevamReis/SimpleSpider/tree/master/Simple.Test/Sample) \ No newline at end of file diff --git a/README.md b/README.md index eba124c..bf370d5 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,8 @@ void fetchCompleted_items(object Sender, FetchCompleteEventArgs args) // walk around ... // TIP: inspect args to see stuff - // Two good starts: XElement and HObject - var XElement = args.GetXElement(); var hObj = args.GetHObject(); + string[] quotes = hObj["span > .text"]; } ``` diff --git a/Simple.Lib/Simple.Lib.csproj b/Simple.Lib/Simple.Lib.csproj index 6f44f35..c977f77 100644 --- a/Simple.Lib/Simple.Lib.csproj +++ b/Simple.Lib/Simple.Lib.csproj @@ -12,16 +12,16 @@ https://github.com/RafaelEstevamReis/SimpleSpider https://github.com/RafaelEstevamReis/SimpleSpider git - spider + spider web crawling false - 0.5.266 + 0.5.272 true - A simple to use web spider with an example rich GitHub repository + A simple to use spider for web crawling with an example rich GitHub repository (work in progress) Work in progress. 
See examples and documentation on GitHub page -Better redirect handling, added HObject support -Commit 29d3a44 - 0.5.266.0 +Improved performance +Commit 2bd7336 + 0.5.272.0 diff --git a/SimpleSpider.sln b/SimpleSpider.sln index 69e58ac..210aea6 100644 --- a/SimpleSpider.sln +++ b/SimpleSpider.sln @@ -10,10 +10,11 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Text", "Text", "{78F66FBC-1B52-4851-89E3-798D6E6AE310}" ProjectSection(SolutionItems) = preProject LICENSE = LICENSE + NugetPage.md = NugetPage.md README.md = README.md EndProjectSection EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Simple.UnitTests", "Simple.UnitTests\Simple.UnitTests.csproj", "{A7EB8974-B50F-499A-BAAA-53C2CB33E08C}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Simple.UnitTests", "Simple.UnitTests\Simple.UnitTests.csproj", "{A7EB8974-B50F-499A-BAAA-53C2CB33E08C}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution