-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create sinequa_webcrawler_base_template.xml
Sinequa base template for the web crawler connector
- Loading branch information
1 parent
89464fb
commit 0133d86
Showing
1 changed file
with
310 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,310 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<Sinequa> | ||
<!-- General collection settings. Only the description, the connector type
     (crawler2) and ForceReindexation carry values; the identity, indexer,
     index, domain, tree root and plugin elements are empty in this template. -->
<Description>Default crawler to create a URL candidate list</Description> | ||
<Connector>crawler2</Connector> | ||
<Identity></Identity> | ||
<Indexers></Indexers> | ||
<Index></Index> | ||
<Domain></Domain> | ||
<TreeRoot></TreeRoot> | ||
<ForceReindexation>false</ForceReindexation> | ||
<Plugin></Plugin> | ||
<!-- Include/exclude filters. Only the extension exclusion list is populated;
     its value is a semicolon-separated list of extensions without dots. -->
<IncludedExtensions></IncludedExtensions> | ||
<ExcludedExtensions>rtf;jy;xml;ico;gz;act;txt;avi;mp4;mp3;zip;py</ExcludedExtensions> | ||
<IncludedFilenames></IncludedFilenames> | ||
<ExcludedFilenames></ExcludedFilenames> | ||
<IncludedFolders></IncludedFolders> | ||
<ExcludedFolders></ExcludedFolders> | ||
<!-- Indexation options. Every Simulate* switch is false, EngineMetaEnabled is
     the only switch set to true, and all thumbnail sizes and timeouts are
     left empty. -->
<Indexation> | ||
<!-- NOTE(review): this Mappings element is empty, while a Mapping element is
     defined directly under the root near the end of the file; confirm which
     location the product reads. -->
<Mappings></Mappings> | ||
<SimulateLemma>false</SimulateLemma> | ||
<SimulateEngine>false</SimulateEngine> | ||
<SimulateCache>false</SimulateCache> | ||
<SimulateLemmaMin></SimulateLemmaMin> | ||
<SimulateLemmaMax></SimulateLemmaMax> | ||
<EngineMetaEnabled>true</EngineMetaEnabled> | ||
<ThumbnailHeight></ThumbnailHeight> | ||
<ThumbnailWidth></ThumbnailWidth> | ||
<ThumbnailSmallTimeout></ThumbnailSmallTimeout> | ||
<ThumbnailMediumTimeout></ThumbnailMediumTimeout> | ||
<ThumbnailLargeTimeout></ThumbnailLargeTimeout> | ||
<SynchThumbnailGen>false</SynchThumbnailGen> | ||
<StoreInCollectionCache>false</StoreInCollectionCache> | ||
<GetFilePropertiesFromConverter>false</GetFilePropertiesFromConverter> | ||
<CollectionStateParallelRowFetch>0</CollectionStateParallelRowFetch> | ||
</Indexation> | ||
<!-- System section: the log level is left empty. -->
<System> | ||
<LogLevel></LogLevel> | ||
</System> | ||
<!-- Root-level runtime flags. Only UsePerformanceMetrics and
     HtmlNavigatorNative are true; the other switches are false and the
     remaining values (limits, repositories, secrets) are empty. -->
<DisplayLongProperties>false</DisplayLongProperties> | ||
<LongPropertyLimit></LongPropertyLimit> | ||
<UsePerformanceMetrics>true</UsePerformanceMetrics> | ||
<LogPerformanceMetricsPeriodically>false</LogPerformanceMetricsPeriodically> | ||
<PasswordRepository></PasswordRepository> | ||
<StoreDocumentCache></StoreDocumentCache> | ||
<AuditEnabled>false</AuditEnabled> | ||
<SaveDeniedDocs>false</SaveDeniedDocs> | ||
<SavePropertiesToRegistry>false</SavePropertiesToRegistry> | ||
<CollectionStateNative>false</CollectionStateNative> | ||
<HtmlNavigatorNative>true</HtmlNavigatorNative> | ||
<XPathNavigatorNative>false</XPathNavigatorNative> | ||
<StatusMaxOk></StatusMaxOk> | ||
<DelApiSecret></DelApiSecret> | ||
<!-- Indexer client settings. The simulation switches are false and New is
     true; every numeric tuning value (thread count, queue size, timeouts,
     retry and sleep settings) is left empty in this template. -->
<IndexerClient> | ||
<Simulate>false</Simulate> | ||
<SimulateGetCollectionState>false</SimulateGetCollectionState> | ||
<New>true</New> | ||
<SendThreadCount></SendThreadCount> | ||
<QueueMaxCount></QueueMaxCount> | ||
<DirectFileAccess>false</DirectFileAccess> | ||
<!-- An element named UseCompression also appears directly under the root;
     this one is scoped to IndexerClient. -->
<UseCompression>false</UseCompression> | ||
<SessionIsFinishedWait>false</SessionIsFinishedWait> | ||
<SendTimeout></SendTimeout> | ||
<ReceiveTimeout></ReceiveTimeout> | ||
<RetryConnectCount></RetryConnectCount> | ||
<RetryConnectDelay></RetryConnectDelay> | ||
<SleepQueueFull></SleepQueueFull> | ||
<SleepQueueFullCount></SleepQueueFullCount> | ||
<SleepQueueFullQuick></SleepQueueFullQuick> | ||
<SleepQueueFullQuickCount></SleepQueueFullQuickCount> | ||
<SleepCheckOpen></SleepCheckOpen> | ||
<SleepCheckOpenCount></SleepCheckOpenCount> | ||
<SleepCheckOpenQuick></SleepCheckOpenQuick> | ||
<SleepCheckOpenQuickCount></SleepCheckOpenQuickCount> | ||
<DeactivationTimeout></DeactivationTimeout> | ||
<BackToSendingQueueCount></BackToSendingQueueCount> | ||
</IndexerClient> | ||
<!-- Error handling and deletion policy. ContinueOnError and DoDelete are
     true; the delete-on-error switches and AcceptDeleteAll are false, and
     the three deletion thresholds are left empty. -->
<ForceBlobSend>false</ForceBlobSend> | ||
<ContinueOnError>true</ContinueOnError> | ||
<DoDelete>true</DoDelete> | ||
<DeleteOnError>false</DeleteOnError> | ||
<DeleteOnEnumerationError>false</DeleteOnEnumerationError> | ||
<AcceptDeleteAll>false</AcceptDeleteAll> | ||
<DeleteMaxPercentThreshold></DeleteMaxPercentThreshold> | ||
<DeleteMaxThreshold></DeleteMaxThreshold> | ||
<DeleteMinRemainingThreshold></DeleteMinRemainingThreshold> | ||
<!-- Collection state switches: only RealTimeIncrementalState is true. -->
<SaveCollectionState>false</SaveCollectionState> | ||
<IncrementalState>false</IncrementalState> | ||
<RealTimeIncrementalState>true</RealTimeIncrementalState> | ||
<RealTimeInfoOnError>false</RealTimeInfoOnError> | ||
<!-- Conversion and document settings. AddBaseHref and
     ClearHttpRequestCanonicalizeAsFilePath are true; the proxy, plan,
     throttle, document class and language values are empty. -->
<ConversionProxies></ConversionProxies> | ||
<ConversionPlan></ConversionPlan> | ||
<AddBaseHref>true</AddBaseHref> | ||
<AddMetaContentType>false</AddMetaContentType> | ||
<Throttle></Throttle> | ||
<DocumentClass></DocumentClass> | ||
<ConnectorLanguage></ConnectorLanguage> | ||
<ClearHttpRequestCanonicalizeAsFilePath>true</ClearHttpRequestCanonicalizeAsFilePath> | ||
<!-- Container and attachment indexing switches. All are false except
     IndexPstMsg, IndexPstMsgAttachments and IndexPstDocument.
     NOTE(review): IndexPstContent is false while those three Pst switches
     are true; whether they have any effect in that state cannot be told
     from this file. Confirm against the connector documentation. -->
<IndexZipContent>false</IndexZipContent> | ||
<IndexPdfAttachments>false</IndexPdfAttachments> | ||
<IndexOleAttachments>false</IndexOleAttachments> | ||
<IndexMsgContent>false</IndexMsgContent> | ||
<IndexMsgAttachments>false</IndexMsgAttachments> | ||
<IndexOftContent>false</IndexOftContent> | ||
<IndexOftAttachments>false</IndexOftAttachments> | ||
<IndexEmlContent>false</IndexEmlContent> | ||
<IndexEmlAttachments>false</IndexEmlAttachments> | ||
<IndexPstContent>false</IndexPstContent> | ||
<IndexOstContent>false</IndexOstContent> | ||
<IndexPstMsg>true</IndexPstMsg> | ||
<IndexPstMsgAttachments>true</IndexPstMsgAttachments> | ||
<IndexPstContact>false</IndexPstContact> | ||
<IndexPstCalendar>false</IndexPstCalendar> | ||
<IndexPstNote>false</IndexPstNote> | ||
<IndexPstTask>false</IndexPstTask> | ||
<IndexPstDocument>true</IndexPstDocument> | ||
<PstUseSafeId>false</PstUseSafeId> | ||
<IndexArchivesExtensions></IndexArchivesExtensions> | ||
<ArchiveItemsUseArchiveVersion>false</ArchiveItemsUseArchiveVersion> | ||
<UseShortAttachmentId>false</UseShortAttachmentId> | ||
<UseExtendedExtensionGuesser>false</UseExtendedExtensionGuesser> | ||
<!-- Extension lists and selection queries: all empty in this template. -->
<XmpExtensions></XmpExtensions> | ||
<MediaExtensions></MediaExtensions> | ||
<ExiftoolExtensions></ExiftoolExtensions> | ||
<EarlySelectionQuery></EarlySelectionQuery> | ||
<SelectionQuery></SelectionQuery> | ||
<AttachmentSelectionQuery></AttachmentSelectionQuery> | ||
<ArchiveItemSelectionQuery></ArchiveItemSelectionQuery> | ||
<EngineConnectionWait></EngineConnectionWait> | ||
<!-- Graph boost, field permissions, sharding and curation. Every switch in
     this group is false and every value is empty. -->
<CalculateGraphBoost>false</CalculateGraphBoost> | ||
<GraphBoostColumn></GraphBoostColumn> | ||
<GraphBoostEMColumn></GraphBoostEMColumn> | ||
<GraphBoostIterations></GraphBoostIterations> | ||
<GraphBoostPower></GraphBoostPower> | ||
<GraphBoostAdd></GraphBoostAdd> | ||
<UseFieldPermissions>false</UseFieldPermissions> | ||
<ShardIndexes></ShardIndexes> | ||
<ShardingStrategy></ShardingStrategy> | ||
<ShardSelections></ShardSelections> | ||
<CurationType></CurationType> | ||
<CurationIdPattern></CurationIdPattern> | ||
<RunIndexMiningInIndexer>false</RunIndexMiningInIndexer> | ||
<Namespace></Namespace> | ||
<!-- Json section: master file/URL and separator values are empty, and both
     UseMasterVersion switches are false. -->
<Json> | ||
<MasterFileToIndex></MasterFileToIndex> | ||
<MasterUrlToIndex></MasterUrlToIndex> | ||
<FileToIndexSep></FileToIndexSep> | ||
<FileToIndexUseMasterVersion>false</FileToIndexUseMasterVersion> | ||
<UrlToIndexSep></UrlToIndexSep> | ||
<UrlToIndexUseMasterVersion>false</UrlToIndexUseMasterVersion> | ||
</Json> | ||
<!-- FS-prefixed counterparts of the include/exclude filters defined near the
     top of the file; all are empty here, as is HtmlBuilderType.
     NOTE(review): how the FS* filters relate to the unprefixed ones is not
     visible in this file. -->
<FSIncludedExtensions></FSIncludedExtensions> | ||
<FSExcludedExtensions></FSExcludedExtensions> | ||
<FSIncludedFilenames></FSIncludedFilenames> | ||
<FSExcludedFilenames></FSExcludedFilenames> | ||
<FSIncludedFolders></FSIncludedFolders> | ||
<FSExcludedFolders></FSExcludedFolders> | ||
<HtmlBuilderType></HtmlBuilderType> | ||
<!-- URL access settings: HTTP credentials and connection options, login
     handling, FTP and file access, Amazon S3 and proxy configuration.
     NOTE(review): every user, password, secret and certificate field is
     empty in this template; when filling it in, avoid committing real
     credentials in plain text. -->
<UrlAccess> | ||
<!-- HTTP credentials and connection behaviour. UseDefaultCredentials,
     AllowAuthenticatedConnectionSharing and KeepAlive are true. -->
<UseDefaultCredentials>true</UseDefaultCredentials> | ||
<UseDefaultNetworkCredentials>false</UseDefaultNetworkCredentials> | ||
<User></User> | ||
<Password></Password> | ||
<Domain></Domain> | ||
<UseRfc1945>false</UseRfc1945> | ||
<Timeout></Timeout> | ||
<ChangeConnectionGroupNameOnTimeout>false</ChangeConnectionGroupNameOnTimeout> | ||
<AllowAuthenticatedConnectionSharing>true</AllowAuthenticatedConnectionSharing> | ||
<PreAuthenticate>false</PreAuthenticate> | ||
<HttpVersion></HttpVersion> | ||
<KeepAlive>true</KeepAlive> | ||
<SecurityProtocol></SecurityProtocol> | ||
<UserAgent></UserAgent> | ||
<ClientCertificateFile></ClientCertificateFile> | ||
<ClientCertificatePassword></ClientCertificatePassword> | ||
<ClientCertificateStorage></ClientCertificateStorage> | ||
<AllowXPathCookies>false</AllowXPathCookies> | ||
<!-- Request engine selection: both the HttpClient and the browser options
     are false, and the browser tuning values are empty. -->
<UseHttpClientForWebRequests>false</UseHttpClientForWebRequests> | ||
<UseBrowserForWebRequests>false</UseBrowserForWebRequests> | ||
<BrowserForWebRequestsReadinessThreshold></BrowserForWebRequestsReadinessThreshold> | ||
<BrowserForWebRequestsInitialDelay></BrowserForWebRequestsInitialDelay> | ||
<BrowserForWebRequestsMaxTotalDelay></BrowserForWebRequestsMaxTotalDelay> | ||
<BrowserForWebRequestsMaxResourcesDelay></BrowserForWebRequestsMaxResourcesDelay> | ||
<BrowserForWebRequestsLogLevel></BrowserForWebRequestsLogLevel> | ||
<WebConnectionPluginName></WebConnectionPluginName> | ||
<!-- Form (POST) login settings: no login URL or data is configured. -->
<PostLoginUrl></PostLoginUrl> | ||
<PostLoginData></PostLoginData> | ||
<GetBeforePostLogin>false</GetBeforePostLogin> | ||
<PostLoginAutoRedirect>true</PostLoginAutoRedirect> | ||
<ReLoginCount></ReLoginCount> | ||
<ReLoginDelay></ReLoginDelay> | ||
<DetectHtmlLoginPattern></DetectHtmlLoginPattern> | ||
<!-- Browser-based login: Activate is false. -->
<BrowserLogin> | ||
<Activate>false</Activate> | ||
<RemoteDebuggingPort></RemoteDebuggingPort> | ||
<BrowserLogLevel></BrowserLogLevel> | ||
<SuccessCondition></SuccessCondition> | ||
<CookieFilter></CookieFilter> | ||
</BrowserLogin> | ||
<!-- FTP access: binary and passive switches are true, SSL is false. -->
<FtpUser></FtpUser> | ||
<FtpPassword></FtpPassword> | ||
<FtpDomain></FtpDomain> | ||
<FtpUseBinary>true</FtpUseBinary> | ||
<FtpUsePassive>true</FtpUsePassive> | ||
<FtpReadWriteTimeout></FtpReadWriteTimeout> | ||
<FtpTimeout></FtpTimeout> | ||
<FtpEnableSsl>false</FtpEnableSsl> | ||
<!-- File access credentials: all empty. -->
<FileUser></FileUser> | ||
<FilePassword></FilePassword> | ||
<FileDomain></FileDomain> | ||
<FileTimeout></FileTimeout> | ||
<!-- Amazon S3: RegionEndpoint (eu-west-1) is the only populated value. -->
<AmazonS3> | ||
<AccessKey></AccessKey> | ||
<SecretKey></SecretKey> | ||
<RegionEndpoint>eu-west-1</RegionEndpoint> | ||
<ServiceURL></ServiceURL> | ||
</AmazonS3> | ||
<!-- Proxy: ProxyAutoDetect is true and no explicit address or server is
     given; ProxyPort is 80 even though ProxyServer is empty. -->
<ProxyAutoDetect>true</ProxyAutoDetect> | ||
<ProxyAddress></ProxyAddress> | ||
<ProxyBypassOnLocal>true</ProxyBypassOnLocal> | ||
<ProxyServer></ProxyServer> | ||
<ProxyPort>80</ProxyPort> | ||
<ProxyUseDefaultCredentials>true</ProxyUseDefaultCredentials> | ||
<ProxyUseDefaultNetworkCredentials>false</ProxyUseDefaultNetworkCredentials> | ||
<ProxyUser></ProxyUser> | ||
<ProxyPassword></ProxyPassword> | ||
<ProxyDomain></ProxyDomain> | ||
<!-- These two viewport values share the BrowserForWebRequests prefix with
     the options listed earlier in this section but sit after the proxy
     settings. -->
<BrowserForWebRequestsViewportWidth></BrowserForWebRequestsViewportWidth> | ||
<BrowserForWebRequestsViewportHeight></BrowserForWebRequestsViewportHeight> | ||
</UrlAccess> | ||
<!-- Crawl scope and limits. WorkerCount is 8; both UrlList and
     DynamicUrlList are empty, so no start URL is defined in this template. -->
<WorkerCount>8</WorkerCount> | ||
<MaxWorkerPerHost></MaxWorkerPerHost> | ||
<UrlList></UrlList> | ||
<DynamicUrlList></DynamicUrlList> | ||
<UrlStayInside>true</UrlStayInside> | ||
<UrlRefererStayInside>true</UrlRefererStayInside> | ||
<FollowLinks>true</FollowLinks> | ||
<MaxLevel>100</MaxLevel> | ||
<MaxToIndex>100000</MaxToIndex> | ||
<MaxToCrawl>100000</MaxToCrawl> | ||
<MaxRedirection>10</MaxRedirection> | ||
<!-- NOTE(review): -1 presumably means "no limit" for these two values;
     confirm against the connector documentation. -->
<CrawlMaxSize>-1</CrawlMaxSize> | ||
<CrawlTimeout>-1</CrawlTimeout> | ||
<NormalizeUrls>true</NormalizeUrls> | ||
<CorrectDomainCookies>false</CorrectDomainCookies> | ||
<IgnoreSessionCookies>false</IgnoreSessionCookies> | ||
<!-- Download and content switches: images, media, FTP and file downloads
     are true while CSS download and JS indexing are false. -->
<DownloadImages>true</DownloadImages> | ||
<DownloadMedia>true</DownloadMedia> | ||
<DownloadCss>false</DownloadCss> | ||
<DownloadFtp>true</DownloadFtp> | ||
<DownloadFile>true</DownloadFile> | ||
<IndexJs>false</IndexJs> | ||
<FollowJs>true</FollowJs> | ||
<CrawlFlash>true</CrawlFlash> | ||
<IndexEmptyPages>true</IndexEmptyPages> | ||
<CrawlWebsphereSeedlist>true</CrawlWebsphereSeedlist> | ||
<KeepHashFragmentInUrl>false</KeepHashFragmentInUrl> | ||
<!-- Retry behaviour. RetryPause carries its unit inside the value ("0 ms"). -->
<RetryCount>1</RetryCount> | ||
<RetryPause>0 ms</RetryPause> | ||
<!-- Conditional-request settings.
     NOTE(review): UseIfNoneMatch holds "no" while the neighbouring switches
     use true/false; confirm which values this option accepts. -->
<UseIfModifiedSince>true</UseIfModifiedSince> | ||
<UseIfNoneMatch>no</UseIfNoneMatch> | ||
<AcceptWeakETag>false</AcceptWeakETag> | ||
<ForcedEncoding></ForcedEncoding> | ||
<!-- An element named UseCompression also exists inside IndexerClient; this
     one sits directly under the root. -->
<UseCompression>false</UseCompression> | ||
<UseUnsafeHeaderParsing>false</UseUnsafeHeaderParsing> | ||
<NormalizeSecureSchemesWhenTestingVisited>false</NormalizeSecureSchemesWhenTestingVisited> | ||
<!-- Deduplication and crawl pausing: both deduplication modes are false and
     the pause values are empty. -->
<ExactDeduplication>false</ExactDeduplication> | ||
<NearDeduplication>false</NearDeduplication> | ||
<CrawlPauseDelay></CrawlPauseDelay> | ||
<CrawlPauseCount></CrawlPauseCount> | ||
<UseRuntimeAutoRedirect>false</UseRuntimeAutoRedirect> | ||
<!-- Failure memory: DNS, connect and trust failures are remembered (true);
     proxy name resolution failures are not (false). -->
<RememberDnsFailure>true</RememberDnsFailure> | ||
<RememberConnectFailure>true</RememberConnectFailure> | ||
<RememberTrustFailure>true</RememberTrustFailure> | ||
<RememberProxyNameResolutionFailure>false</RememberProxyNameResolutionFailure> | ||
<!-- Robots and link-relation handling. UseRobotsNoFollow is the only switch
     set to true in this group; UseRobotsTxt, UseRobotsNoIndex, the sitemap
     options, UseCanonicalLinks and UseRelNoFollow are all false.
     NOTE(review): with UseRobotsTxt set to false the template does not
     enable robots.txt handling; confirm this is intended for the sites that
     will be crawled. -->
<UseRobotsNoIndex>false</UseRobotsNoIndex> | ||
<UseRobotsNoFollow>true</UseRobotsNoFollow> | ||
<UseRobotsTxt>false</UseRobotsTxt> | ||
<RobotsTxtCaseSensitive>false</RobotsTxtCaseSensitive> | ||
<LoadRobotsTxtSitemapUrls>false</LoadRobotsTxtSitemapUrls> | ||
<CheckSitemapUrlLastmodInRealtimeMode>false</CheckSitemapUrlLastmodInRealtimeMode> | ||
<AddRobotsTxtAllowUrlsToSeedList>false</AddRobotsTxtAllowUrlsToSeedList> | ||
<UseCanonicalLinks>false</UseCanonicalLinks> | ||
<UseRelNoFollow>false</UseRelNoFollow> | ||
<!-- Download/follow/index selection queries are empty; the default tag and
     JS transform sets are loaded (both true). -->
<DownloadSelectionQuery></DownloadSelectionQuery> | ||
<FollowSelectionQuery></FollowSelectionQuery> | ||
<IndexSelectionQuery></IndexSelectionQuery> | ||
<LoadDefaultTags>true</LoadDefaultTags> | ||
<LoadDefaultJsTransforms>true</LoadDefaultJsTransforms> | ||
<!-- NOTE(review): "visibility" is the only element in this file whose name
     starts with a lowercase letter; element names are case-sensitive, so
     confirm the spelling the product expects. -->
<visibility></visibility> | ||
<!-- PdfGen section: the converter type and the three timeouts are all empty. -->
<PdfGen> | ||
<ConverterType></ConverterType> | ||
<TimeoutSmall></TimeoutSmall> | ||
<TimeoutMedium></TimeoutMedium> | ||
<TimeoutLarge></TimeoutLarge> | ||
</PdfGen> | ||
<!-- Trailing root-level options: Revision is 1, both switches are false and
     the remaining values are empty. -->
<DeleteOnNetworkOrServerError>false</DeleteOnNetworkOrServerError> | ||
<Revision>1</Revision> | ||
<EnableNeuralIndexing>false</EnableNeuralIndexing> | ||
<NeuralSearchSelectionQuery></NeuralSearchSelectionQuery> | ||
<HttpCodesToRetry></HttpCodesToRetry> | ||
<!-- LogPerformanceMetricsPeriodically, set earlier in the file, is false;
     this period value is empty. -->
<LogPerformanceMetricsPeriod></LogPerformanceMetricsPeriod> | ||
<!-- The only mapping in this template: Name "id" is paired with the value
     expression "doc.url1"; description, selection and default value are
     empty.
     NOTE(review): this Mapping is a direct child of the root, while the
     Mappings element inside Indexation is empty; confirm this is the
     location the product expects. -->
<Mapping> | ||
<Name>id</Name> | ||
<Value>doc.url1</Value> | ||
<Description></Description> | ||
<Selection></Selection> | ||
<DefaultValue></DefaultValue> | ||
</Mapping> | ||
<!-- Left empty in this template. -->
<DocCountLimitOnCollectProperties></DocCountLimitOnCollectProperties> | ||
</Sinequa> |