From 0133d86e9248481664cc8dd09dd9fd2d47e86fd3 Mon Sep 17 00:00:00 2001 From: shravan-lrm <124795004+shravan-lrm@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:23:31 +0530 Subject: [PATCH] Create sinequa_webcrawler_base_template.xml Sinequa base template for webcrawler --- scraper/sinequa_webcrawler_base_template.xml | 310 +++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 scraper/sinequa_webcrawler_base_template.xml diff --git a/scraper/sinequa_webcrawler_base_template.xml b/scraper/sinequa_webcrawler_base_template.xml new file mode 100644 index 00000000..6db8d54c --- /dev/null +++ b/scraper/sinequa_webcrawler_base_template.xml @@ -0,0 +1,310 @@ + + + Default crawler to create a URL candidate list + crawler2 + + + + + + false + + + rtf;jy;xml;ico;gz;act;txt;avi;mp4;mp3;zip;py + + + + + + + false + false + false + + + true + + + + + + false + false + false + 0 + + + + + false + + true + false + + + false + false + false + false + true + false + + + + false + false + true + + + false + false + false + + + + + + + + + + + + + + + + false + true + true + false + false + false + + + + false + false + true + false + + + true + false + + + + true + false + false + false + false + false + false + false + false + false + false + false + true + true + false + false + false + false + true + false + + false + false + false + + + + + + + + + false + + + + + + false + + + + + + false + + + + + + false + + false + + + + + + + + + + true + false + + + + false + + false + true + false + + true + + + + + + false + false + false + + + + + + + + + false + true + + + + + false + + + + + + + + + true + true + + + false + + + + + + + + eu-west-1 + + + true + + true + + 80 + true + false + + + + + + + 8 + + + + true + true + true + 100 + 100000 + 100000 + 10 + -1 + -1 + true + false + false + true + true + false + true + true + false + true + true + true + true + false + 1 + 0 ms + true + no + false + + false + false + false + false + false + + + false + true + true + true + false + false + true + false + false + false + false + false + false + false + + + + true + true + + + + + + + + false + 1 + false + + + + + id + doc.url1 + + + + + +