Skip to content

Commit

Permalink
Create sinequa_webcrawler_base_template.xml
Browse files Browse the repository at this point in the history
Sinequa base template for webcrawler
  • Loading branch information
shravan-lrm authored Mar 21, 2024
1 parent 89464fb commit 0133d86
Showing 1 changed file with 310 additions and 0 deletions.
310 changes: 310 additions & 0 deletions scraper/sinequa_webcrawler_base_template.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
<?xml version="1.0" encoding="utf-8"?>
<Sinequa>
<!-- Collection identity and file-selection filters.
     Only Description and Connector (crawler2) carry values; Identity, Indexers, Index,
     Domain, TreeRoot and Plugin are empty. Presumably they are filled in per collection
     when this template is instantiated (TODO confirm with the code that consumes it). -->
<Description>Default crawler to create a URL candidate list</Description>
<Connector>crawler2</Connector>
<Identity></Identity>
<Indexers></Indexers>
<Index></Index>
<Domain></Domain>
<TreeRoot></TreeRoot>
<ForceReindexation>false</ForceReindexation>
<Plugin></Plugin>
<!-- ExcludedExtensions is the only populated filter: a semicolon-separated list of
     extensions. The include list and all filename/folder filters are empty. -->
<IncludedExtensions></IncludedExtensions>
<ExcludedExtensions>rtf;jy;xml;ico;gz;act;txt;avi;mp4;mp3;zip;py</ExcludedExtensions>
<IncludedFilenames></IncludedFilenames>
<ExcludedFilenames></ExcludedFilenames>
<IncludedFolders></IncludedFolders>
<ExcludedFolders></ExcludedFolders>
<!-- Indexation options.
     All three Simulate* switches are false and EngineMetaEnabled is the only flag set to true.
     Mappings, the SimulateLemma bounds, and all thumbnail dimensions/timeouts are empty.
     CollectionStateParallelRowFetch is explicitly 0 (meaning of 0 not visible here; confirm). -->
<Indexation>
<Mappings></Mappings>
<SimulateLemma>false</SimulateLemma>
<SimulateEngine>false</SimulateEngine>
<SimulateCache>false</SimulateCache>
<SimulateLemmaMin></SimulateLemmaMin>
<SimulateLemmaMax></SimulateLemmaMax>
<EngineMetaEnabled>true</EngineMetaEnabled>
<ThumbnailHeight></ThumbnailHeight>
<ThumbnailWidth></ThumbnailWidth>
<ThumbnailSmallTimeout></ThumbnailSmallTimeout>
<ThumbnailMediumTimeout></ThumbnailMediumTimeout>
<ThumbnailLargeTimeout></ThumbnailLargeTimeout>
<SynchThumbnailGen>false</SynchThumbnailGen>
<StoreInCollectionCache>false</StoreInCollectionCache>
<GetFilePropertiesFromConverter>false</GetFilePropertiesFromConverter>
<CollectionStateParallelRowFetch>0</CollectionStateParallelRowFetch>
</Indexation>
<!-- System section. Its single setting, LogLevel, is empty
     (presumably the product default applies; TODO confirm). -->
<System>
<LogLevel></LogLevel>
</System>
<!-- General connector flags.
     Only UsePerformanceMetrics and HtmlNavigatorNative are true; every other boolean in this
     run is false. LongPropertyLimit, PasswordRepository, StoreDocumentCache, StatusMaxOk and
     DelApiSecret are empty, so this template carries no secret or repository reference. -->
<DisplayLongProperties>false</DisplayLongProperties>
<LongPropertyLimit></LongPropertyLimit>
<UsePerformanceMetrics>true</UsePerformanceMetrics>
<LogPerformanceMetricsPeriodically>false</LogPerformanceMetricsPeriodically>
<PasswordRepository></PasswordRepository>
<StoreDocumentCache></StoreDocumentCache>
<AuditEnabled>false</AuditEnabled>
<SaveDeniedDocs>false</SaveDeniedDocs>
<SavePropertiesToRegistry>false</SavePropertiesToRegistry>
<CollectionStateNative>false</CollectionStateNative>
<HtmlNavigatorNative>true</HtmlNavigatorNative>
<XPathNavigatorNative>false</XPathNavigatorNative>
<StatusMaxOk></StatusMaxOk>
<DelApiSecret></DelApiSecret>
<!-- Indexer client settings.
     New is the only flag set to true; Simulate, SimulateGetCollectionState, DirectFileAccess,
     UseCompression and SessionIsFinishedWait are false. Thread count, queue size, timeouts,
     retry and all Sleep* values are empty.
     Note: an element named UseCompression also appears as a direct child of the root;
     the two are distinct settings with different parents. -->
<IndexerClient>
<Simulate>false</Simulate>
<SimulateGetCollectionState>false</SimulateGetCollectionState>
<New>true</New>
<SendThreadCount></SendThreadCount>
<QueueMaxCount></QueueMaxCount>
<DirectFileAccess>false</DirectFileAccess>
<UseCompression>false</UseCompression>
<SessionIsFinishedWait>false</SessionIsFinishedWait>
<SendTimeout></SendTimeout>
<ReceiveTimeout></ReceiveTimeout>
<RetryConnectCount></RetryConnectCount>
<RetryConnectDelay></RetryConnectDelay>
<SleepQueueFull></SleepQueueFull>
<SleepQueueFullCount></SleepQueueFullCount>
<SleepQueueFullQuick></SleepQueueFullQuick>
<SleepQueueFullQuickCount></SleepQueueFullQuickCount>
<SleepCheckOpen></SleepCheckOpen>
<SleepCheckOpenCount></SleepCheckOpenCount>
<SleepCheckOpenQuick></SleepCheckOpenQuick>
<SleepCheckOpenQuickCount></SleepCheckOpenQuickCount>
<DeactivationTimeout></DeactivationTimeout>
<BackToSendingQueueCount></BackToSendingQueueCount>
</IndexerClient>
<!-- Error handling, deletion and collection-state flags.
     ContinueOnError, DoDelete and RealTimeIncrementalState are true; the other booleans are false.
     The three Delete*Threshold values are empty.
     NOTE(review): DoDelete is true while AcceptDeleteAll is false and no thresholds are set;
     how these combine is product behaviour not visible in this file. Confirm before changing. -->
<ForceBlobSend>false</ForceBlobSend>
<ContinueOnError>true</ContinueOnError>
<DoDelete>true</DoDelete>
<DeleteOnError>false</DeleteOnError>
<DeleteOnEnumerationError>false</DeleteOnEnumerationError>
<AcceptDeleteAll>false</AcceptDeleteAll>
<DeleteMaxPercentThreshold></DeleteMaxPercentThreshold>
<DeleteMaxThreshold></DeleteMaxThreshold>
<DeleteMinRemainingThreshold></DeleteMinRemainingThreshold>
<SaveCollectionState>false</SaveCollectionState>
<IncrementalState>false</IncrementalState>
<RealTimeIncrementalState>true</RealTimeIncrementalState>
<RealTimeInfoOnError>false</RealTimeInfoOnError>
<!-- Conversion and document-level settings.
     AddBaseHref and ClearHttpRequestCanonicalizeAsFilePath are true, AddMetaContentType is false.
     ConversionProxies, ConversionPlan, Throttle, DocumentClass and ConnectorLanguage are empty. -->
<ConversionProxies></ConversionProxies>
<ConversionPlan></ConversionPlan>
<AddBaseHref>true</AddBaseHref>
<AddMetaContentType>false</AddMetaContentType>
<Throttle></Throttle>
<DocumentClass></DocumentClass>
<ConnectorLanguage></ConnectorLanguage>
<ClearHttpRequestCanonicalizeAsFilePath>true</ClearHttpRequestCanonicalizeAsFilePath>
<!-- Container, mail and attachment indexing flags.
     Every flag here is false except IndexPstMsg, IndexPstMsgAttachments and IndexPstDocument.
     The four extension lists (IndexArchivesExtensions, XmpExtensions, MediaExtensions,
     ExiftoolExtensions) are empty.
     NOTE(review): IndexPstContent and IndexOstContent are false while three IndexPst* item
     flags are true. Whether those item flags have any effect with PST content indexing off
     is not visible here; confirm against the product documentation. -->
<IndexZipContent>false</IndexZipContent>
<IndexPdfAttachments>false</IndexPdfAttachments>
<IndexOleAttachments>false</IndexOleAttachments>
<IndexMsgContent>false</IndexMsgContent>
<IndexMsgAttachments>false</IndexMsgAttachments>
<IndexOftContent>false</IndexOftContent>
<IndexOftAttachments>false</IndexOftAttachments>
<IndexEmlContent>false</IndexEmlContent>
<IndexEmlAttachments>false</IndexEmlAttachments>
<IndexPstContent>false</IndexPstContent>
<IndexOstContent>false</IndexOstContent>
<IndexPstMsg>true</IndexPstMsg>
<IndexPstMsgAttachments>true</IndexPstMsgAttachments>
<IndexPstContact>false</IndexPstContact>
<IndexPstCalendar>false</IndexPstCalendar>
<IndexPstNote>false</IndexPstNote>
<IndexPstTask>false</IndexPstTask>
<IndexPstDocument>true</IndexPstDocument>
<PstUseSafeId>false</PstUseSafeId>
<IndexArchivesExtensions></IndexArchivesExtensions>
<ArchiveItemsUseArchiveVersion>false</ArchiveItemsUseArchiveVersion>
<UseShortAttachmentId>false</UseShortAttachmentId>
<UseExtendedExtensionGuesser>false</UseExtendedExtensionGuesser>
<XmpExtensions></XmpExtensions>
<MediaExtensions></MediaExtensions>
<ExiftoolExtensions></ExiftoolExtensions>
<!-- Selection queries, graph boost, sharding and curation.
     All four *SelectionQuery elements are empty, as are the GraphBoost* parameters,
     the Shard* settings, the Curation* settings, EngineConnectionWait and Namespace.
     The three booleans (CalculateGraphBoost, UseFieldPermissions, RunIndexMiningInIndexer)
     are all false. -->
<EarlySelectionQuery></EarlySelectionQuery>
<SelectionQuery></SelectionQuery>
<AttachmentSelectionQuery></AttachmentSelectionQuery>
<ArchiveItemSelectionQuery></ArchiveItemSelectionQuery>
<EngineConnectionWait></EngineConnectionWait>
<CalculateGraphBoost>false</CalculateGraphBoost>
<GraphBoostColumn></GraphBoostColumn>
<GraphBoostEMColumn></GraphBoostEMColumn>
<GraphBoostIterations></GraphBoostIterations>
<GraphBoostPower></GraphBoostPower>
<GraphBoostAdd></GraphBoostAdd>
<UseFieldPermissions>false</UseFieldPermissions>
<ShardIndexes></ShardIndexes>
<ShardingStrategy></ShardingStrategy>
<ShardSelections></ShardSelections>
<CurationType></CurationType>
<CurationIdPattern></CurationIdPattern>
<RunIndexMiningInIndexer>false</RunIndexMiningInIndexer>
<Namespace></Namespace>
<!-- Json section. Master file/URL and both separators are empty;
     both *UseMasterVersion flags are false. -->
<Json>
<MasterFileToIndex></MasterFileToIndex>
<MasterUrlToIndex></MasterUrlToIndex>
<FileToIndexSep></FileToIndexSep>
<FileToIndexUseMasterVersion>false</FileToIndexUseMasterVersion>
<UrlToIndexSep></UrlToIndexSep>
<UrlToIndexUseMasterVersion>false</UrlToIndexUseMasterVersion>
</Json>
<!-- FS-prefixed include/exclude filters and HtmlBuilderType: all empty.
     These mirror the names of the unprefixed filters near the top of the file;
     how the two sets relate is not visible here (TODO confirm). -->
<FSIncludedExtensions></FSIncludedExtensions>
<FSExcludedExtensions></FSExcludedExtensions>
<FSIncludedFilenames></FSIncludedFilenames>
<FSExcludedFilenames></FSExcludedFilenames>
<FSIncludedFolders></FSIncludedFolders>
<FSExcludedFolders></FSExcludedFolders>
<HtmlBuilderType></HtmlBuilderType>
<!-- UrlAccess: credentials, HTTP client options, login, FTP/file/S3 access and proxy settings.
     Every User/Password/Domain/Key element in this section is empty, so the template
     itself contains no credentials. -->
<UrlAccess>
<!-- HTTP credentials and connection options. UseDefaultCredentials,
     AllowAuthenticatedConnectionSharing and KeepAlive are true; the other flags are false. -->
<UseDefaultCredentials>true</UseDefaultCredentials>
<UseDefaultNetworkCredentials>false</UseDefaultNetworkCredentials>
<User></User>
<Password></Password>
<Domain></Domain>
<UseRfc1945>false</UseRfc1945>
<Timeout></Timeout>
<ChangeConnectionGroupNameOnTimeout>false</ChangeConnectionGroupNameOnTimeout>
<AllowAuthenticatedConnectionSharing>true</AllowAuthenticatedConnectionSharing>
<PreAuthenticate>false</PreAuthenticate>
<HttpVersion></HttpVersion>
<KeepAlive>true</KeepAlive>
<SecurityProtocol></SecurityProtocol>
<UserAgent></UserAgent>
<ClientCertificateFile></ClientCertificateFile>
<ClientCertificatePassword></ClientCertificatePassword>
<ClientCertificateStorage></ClientCertificateStorage>
<AllowXPathCookies>false</AllowXPathCookies>
<!-- Both alternative request back-ends (HttpClient, browser) are switched off and
     the BrowserForWebRequests* tuning values are empty. -->
<UseHttpClientForWebRequests>false</UseHttpClientForWebRequests>
<UseBrowserForWebRequests>false</UseBrowserForWebRequests>
<BrowserForWebRequestsReadinessThreshold></BrowserForWebRequestsReadinessThreshold>
<BrowserForWebRequestsInitialDelay></BrowserForWebRequestsInitialDelay>
<BrowserForWebRequestsMaxTotalDelay></BrowserForWebRequestsMaxTotalDelay>
<BrowserForWebRequestsMaxResourcesDelay></BrowserForWebRequestsMaxResourcesDelay>
<BrowserForWebRequestsLogLevel></BrowserForWebRequestsLogLevel>
<WebConnectionPluginName></WebConnectionPluginName>
<!-- Form login: no login URL or data is configured; PostLoginAutoRedirect is the only true flag. -->
<PostLoginUrl></PostLoginUrl>
<PostLoginData></PostLoginData>
<GetBeforePostLogin>false</GetBeforePostLogin>
<PostLoginAutoRedirect>true</PostLoginAutoRedirect>
<ReLoginCount></ReLoginCount>
<ReLoginDelay></ReLoginDelay>
<DetectHtmlLoginPattern></DetectHtmlLoginPattern>
<!-- Browser-based login is not activated; its remaining options are empty. -->
<BrowserLogin>
<Activate>false</Activate>
<RemoteDebuggingPort></RemoteDebuggingPort>
<BrowserLogLevel></BrowserLogLevel>
<SuccessCondition></SuccessCondition>
<CookieFilter></CookieFilter>
</BrowserLogin>
<!-- FTP access: no credentials; binary and passive modes are true, SSL is false. -->
<FtpUser></FtpUser>
<FtpPassword></FtpPassword>
<FtpDomain></FtpDomain>
<FtpUseBinary>true</FtpUseBinary>
<FtpUsePassive>true</FtpUsePassive>
<FtpReadWriteTimeout></FtpReadWriteTimeout>
<FtpTimeout></FtpTimeout>
<FtpEnableSsl>false</FtpEnableSsl>
<!-- File access: no credentials or timeout set. -->
<FileUser></FileUser>
<FilePassword></FilePassword>
<FileDomain></FileDomain>
<FileTimeout></FileTimeout>
<!-- Amazon S3: RegionEndpoint is preset to eu-west-1 while AccessKey, SecretKey and
     ServiceURL are empty. NOTE(review): the region looks like a carried-over default
     rather than a deliberate choice; confirm if S3 access is ever enabled. -->
<AmazonS3>
<AccessKey></AccessKey>
<SecretKey></SecretKey>
<RegionEndpoint>eu-west-1</RegionEndpoint>
<ServiceURL></ServiceURL>
</AmazonS3>
<!-- Proxy: auto-detect, bypass-on-local and default credentials are true.
     ProxyPort is 80 but ProxyServer and ProxyAddress are empty, so no explicit proxy
     host is configured in this template. -->
<ProxyAutoDetect>true</ProxyAutoDetect>
<ProxyAddress></ProxyAddress>
<ProxyBypassOnLocal>true</ProxyBypassOnLocal>
<ProxyServer></ProxyServer>
<ProxyPort>80</ProxyPort>
<ProxyUseDefaultCredentials>true</ProxyUseDefaultCredentials>
<ProxyUseDefaultNetworkCredentials>false</ProxyUseDefaultNetworkCredentials>
<ProxyUser></ProxyUser>
<ProxyPassword></ProxyPassword>
<ProxyDomain></ProxyDomain>
<BrowserForWebRequestsViewportWidth></BrowserForWebRequestsViewportWidth>
<BrowserForWebRequestsViewportHeight></BrowserForWebRequestsViewportHeight>
</UrlAccess>
<!-- Crawl scope and limits.
     WorkerCount is 8 with no per-host cap set. UrlList and DynamicUrlList are empty,
     so the seed URLs must be supplied when the template is instantiated.
     UrlStayInside, UrlRefererStayInside and FollowLinks are true.
     Limits: MaxLevel 100, MaxToIndex and MaxToCrawl 100000 each, MaxRedirection 10.
     CrawlMaxSize and CrawlTimeout are -1 (presumably "no limit"; TODO confirm). -->
<WorkerCount>8</WorkerCount>
<MaxWorkerPerHost></MaxWorkerPerHost>
<UrlList></UrlList>
<DynamicUrlList></DynamicUrlList>
<UrlStayInside>true</UrlStayInside>
<UrlRefererStayInside>true</UrlRefererStayInside>
<FollowLinks>true</FollowLinks>
<MaxLevel>100</MaxLevel>
<MaxToIndex>100000</MaxToIndex>
<MaxToCrawl>100000</MaxToCrawl>
<MaxRedirection>10</MaxRedirection>
<CrawlMaxSize>-1</CrawlMaxSize>
<CrawlTimeout>-1</CrawlTimeout>
<!-- Fetch behaviour: URL normalisation, cookies, what to download, retries,
     conditional requests, deduplication and failure memory. -->
<NormalizeUrls>true</NormalizeUrls>
<CorrectDomainCookies>false</CorrectDomainCookies>
<IgnoreSessionCookies>false</IgnoreSessionCookies>
<!-- Download switches: images, media, FTP and file are true; CSS is false.
     NOTE(review): DownloadMedia is true although the top-level ExcludedExtensions list
     contains avi;mp4;mp3. How the two settings interact is not visible here; confirm. -->
<DownloadImages>true</DownloadImages>
<DownloadMedia>true</DownloadMedia>
<DownloadCss>false</DownloadCss>
<DownloadFtp>true</DownloadFtp>
<DownloadFile>true</DownloadFile>
<IndexJs>false</IndexJs>
<FollowJs>true</FollowJs>
<CrawlFlash>true</CrawlFlash>
<IndexEmptyPages>true</IndexEmptyPages>
<CrawlWebsphereSeedlist>true</CrawlWebsphereSeedlist>
<KeepHashFragmentInUrl>false</KeepHashFragmentInUrl>
<!-- One retry with a pause of "0 ms"; the unit is part of the value text. -->
<RetryCount>1</RetryCount>
<RetryPause>0 ms</RetryPause>
<UseIfModifiedSince>true</UseIfModifiedSince>
<!-- UseIfNoneMatch holds "no" where every other switch in this file uses true/false.
     Presumably an enumerated option rather than a boolean; confirm before normalising it. -->
<UseIfNoneMatch>no</UseIfNoneMatch>
<AcceptWeakETag>false</AcceptWeakETag>
<ForcedEncoding></ForcedEncoding>
<!-- A second element named UseCompression exists inside IndexerClient; this one is a
     direct child of the root. Both are false. -->
<UseCompression>false</UseCompression>
<UseUnsafeHeaderParsing>false</UseUnsafeHeaderParsing>
<NormalizeSecureSchemesWhenTestingVisited>false</NormalizeSecureSchemesWhenTestingVisited>
<ExactDeduplication>false</ExactDeduplication>
<NearDeduplication>false</NearDeduplication>
<CrawlPauseDelay></CrawlPauseDelay>
<CrawlPauseCount></CrawlPauseCount>
<UseRuntimeAutoRedirect>false</UseRuntimeAutoRedirect>
<!-- Failure memory: DNS, connect and trust failures are remembered (true);
     proxy name-resolution failures are not (false). -->
<RememberDnsFailure>true</RememberDnsFailure>
<RememberConnectFailure>true</RememberConnectFailure>
<RememberTrustFailure>true</RememberTrustFailure>
<RememberProxyNameResolutionFailure>false</RememberProxyNameResolutionFailure>
<!-- Robots, sitemap and link-policy switches.
     UseRobotsNoFollow is the only true flag in the robots/canonical group; UseRobotsTxt,
     UseRobotsNoIndex, sitemap loading, UseCanonicalLinks and UseRelNoFollow are all false.
     NOTE(review): with UseRobotsTxt false this template does not appear to opt in to
     robots.txt handling; confirm that is intended for the sites being crawled. -->
<UseRobotsNoIndex>false</UseRobotsNoIndex>
<UseRobotsNoFollow>true</UseRobotsNoFollow>
<UseRobotsTxt>false</UseRobotsTxt>
<RobotsTxtCaseSensitive>false</RobotsTxtCaseSensitive>
<LoadRobotsTxtSitemapUrls>false</LoadRobotsTxtSitemapUrls>
<CheckSitemapUrlLastmodInRealtimeMode>false</CheckSitemapUrlLastmodInRealtimeMode>
<AddRobotsTxtAllowUrlsToSeedList>false</AddRobotsTxtAllowUrlsToSeedList>
<UseCanonicalLinks>false</UseCanonicalLinks>
<UseRelNoFollow>false</UseRelNoFollow>
<!-- Download/Follow/Index selection queries are empty; default tags and JS transforms are loaded. -->
<DownloadSelectionQuery></DownloadSelectionQuery>
<FollowSelectionQuery></FollowSelectionQuery>
<IndexSelectionQuery></IndexSelectionQuery>
<LoadDefaultTags>true</LoadDefaultTags>
<LoadDefaultJsTransforms>true</LoadDefaultJsTransforms>
<!-- "visibility" is the only lowercase element name in the file. XML names are
     case-sensitive, so leave the casing as is unless the consuming product says otherwise. -->
<visibility></visibility>
<!-- PdfGen section: converter type and the three size-tiered timeouts are all empty. -->
<PdfGen>
<ConverterType></ConverterType>
<TimeoutSmall></TimeoutSmall>
<TimeoutMedium></TimeoutMedium>
<TimeoutLarge></TimeoutLarge>
</PdfGen>
<!-- Trailing settings. Revision is 1; DeleteOnNetworkOrServerError and EnableNeuralIndexing
     are false; the neural selection query, HttpCodesToRetry and
     LogPerformanceMetricsPeriod are empty. -->
<DeleteOnNetworkOrServerError>false</DeleteOnNetworkOrServerError>
<Revision>1</Revision>
<EnableNeuralIndexing>false</EnableNeuralIndexing>
<NeuralSearchSelectionQuery></NeuralSearchSelectionQuery>
<HttpCodesToRetry></HttpCodesToRetry>
<LogPerformanceMetricsPeriod></LogPerformanceMetricsPeriod>
<!-- The single mapping in this template: Name "id" is given the Value "doc.url1".
     Description, Selection and DefaultValue are empty.
     Note: this Mapping is a direct child of the root, whereas the empty Mappings element
     lives under Indexation. NOTE(review): "doc.url1" is presumably the document URL
     column; confirm against the product's mapping reference. -->
<Mapping>
<Name>id</Name>
<Value>doc.url1</Value>
<Description></Description>
<Selection></Selection>
<DefaultValue></DefaultValue>
</Mapping>
<!-- Left empty in this template (presumably the product default applies; TODO confirm). -->
<DocCountLimitOnCollectProperties></DocCountLimitOnCollectProperties>
</Sinequa>

0 comments on commit 0133d86

Please sign in to comment.