Skip to content

Commit

Permalink
Add post load delay to Browsertrix (#1700)
Browse files Browse the repository at this point in the history
Fixes #1699 

Adds post load delay to:
- Backend `RawCrawlConfig` model
- Frontend (workflow editor and config details component)
- Workflow setup docs
  • Loading branch information
tw4l authored Apr 19, 2024
1 parent 9609ff4 commit 80008a2
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 0 deletions.
1 change: 1 addition & 0 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ class RawCrawlConfig(BaseModel):
behaviorTimeout: Optional[int]
pageLoadTimeout: Optional[int]
pageExtraDelay: Optional[int] = 0
postLoadDelay: Optional[int] = 0

workers: Optional[int] = None

Expand Down
4 changes: 4 additions & 0 deletions docs/user-guide/workflow-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ Increasing the amount of crawler instances will speed up crawls by using additio

Limits the amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.

### Delay After Page Load

Waits on the page for a set number of seconds after the initial HTML page load before moving on to the next steps, such as link extraction and behaviors. This can be useful for pages whose content is slow to load.

### Behavior Timeout

Limits the amount of elapsed time behaviors have to complete.
Expand Down
4 changes: 4 additions & 0 deletions frontend/src/components/ui/config-details.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ export class ConfigDetails extends LiteElement {
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity,
),
)}
${this.renderSetting(
msg("Delay After Page Load"),
renderTimeLimit(crawlConfig?.config.postLoadDelay, 0),
)}
${this.renderSetting(
msg("Page Behavior Timeout"),
renderTimeLimit(
Expand Down
23 changes: 23 additions & 0 deletions frontend/src/pages/org/workflow-editor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
postLoadDelaySeconds: number | null;
maxCrawlSizeGB: number;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
Expand Down Expand Up @@ -184,6 +185,7 @@ const getDefaultFormState = (): FormState => ({
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
postLoadDelaySeconds: null,
maxScopeDepth: null,
scopeType: "host",
exclusions: [],
Expand Down Expand Up @@ -580,6 +582,8 @@ export class CrawlConfigEditor extends LiteElement {
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
pageExtraDelaySeconds:
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
postLoadDelaySeconds:
seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds,
maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth,
scale: this.initialWorkflow.scale,
blockAds: this.initialWorkflow.config.blockAds,
Expand Down Expand Up @@ -1585,6 +1589,24 @@ https://archiveweb.page/images/${"logo.svg"}`}
`Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`,
),
)}
${this.renderFormCol(html`
<sl-input
name="postLoadDelaySeconds"
type="number"
inputmode="numeric"
label=${msg("Delay After Page Load")}
placeholder=${"Default: 0"}
value=${ifDefined(this.formState.postLoadDelaySeconds ?? undefined)}
min="0"
>
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(
`Waits on the page after initial HTML page load prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents.`,
),
)}
${this.renderFormCol(html`
<sl-input
name="behaviorTimeoutSeconds"
Expand Down Expand Up @@ -2445,6 +2467,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
pageExtraDelay: this.formState.pageExtraDelaySeconds,
postLoadDelay: this.formState.postLoadDelaySeconds,
userAgent: this.formState.userAgent,
limit: this.formState.pageLimit,
lang: this.formState.lang || "",
Expand Down
1 change: 1 addition & 0 deletions frontend/src/pages/org/workflows-new.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ const defaultValue = {
behaviorTimeout: null,
pageLoadTimeout: null,
pageExtraDelay: null,
postLoadDelay: null,
useSitemap: false,
failOnFailedSeed: false,
userAgent: null,
Expand Down
1 change: 1 addition & 0 deletions frontend/src/types/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export type SeedConfig = Expand<
behaviorTimeout: number | null;
pageLoadTimeout: number | null;
pageExtraDelay: number | null;
postLoadDelay: number | null;
behaviors?: string | null;
extraHops?: number | null;
useSitemap?: boolean;
Expand Down

0 comments on commit 80008a2

Please sign in to comment.