add source config for ashby dbt

adgramigna · Oct 27, 2023 · ef1e286 · ef1e286
1 parent 14ef730
commit ef1e286
Showing 1 changed file with 184 additions and 0 deletions.
diff --git a/levergreen_dbt/models/staging/ashby/_ashby__sources.yml b/levergreen_dbt/models/staging/ashby/_ashby__sources.yml
@@ -0,0 +1,184 @@
+version: 2
+
+sources:
+  - name: ashby
+    description: >
+      Data from careers pages of companies who use ashby. Called from external Python script and cleaned slightly.
+    schema: public
+    loader: requests
+    loaded_at_field: to_timestamp(updated_at)
+    freshness:
+      warn_after: {count: 1, period: day}
+      error_after: {count: 2, period: day}
+
+    tables:
+      - name: ashby_job_departments
+        description: >
+          Contains job department information from ashby
+        tests:
+          - dbt_expectations.expect_table_columns_to_match_set:
+              column_list: ["id",
+                "levergreen_id", "created_at" , "updated_at", "company_name", "ashby_job_board_source",
+                 "run_hash", "raw_json_file_location", "existing_json_used", "department_id", "department_name",
+                  "parent_department_id"
+              ]
+        columns:
+          - name: id
+            description: serial id created by postgres upon insertion
+            tests:
+              - unique
+              - not_null
+          - name: levergreen_id
+            description: Id from Levergreen scraper. Unique relative to the HTML file used
+            tests:
+              - not_null
+          - name: created_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time. If we used existing HTML, this field is not updated.
+            tests:
+              - not_null
+          - name: updated_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time.
+            tests:
+              - not_null
+          - name: ashby_job_board_source
+            description: Ashby careers page source. 
+            tests:
+              - not_null
+          - name: company_name
+            description: Company name from ashby. Taken by grabbing the end of the source.
+            tests:
+              - not_null
+          - name: run_hash
+            description: Hashed value using hash ids to identify the id of a particular scraped
+          - name: existing_json_used
+            description: Boolean whether or not this was a fresh request
+          - name: raw_json_file_location
+            description: S3 bucket where the scraped data is stored
+          - name: department_id
+            description: Ashby specific id for the department
+          - name: parent_department_id
+            description: Parent id of the department.
+          - name: department_name
+            description: Name of the ashby Department
+
+
+      - name: ashby_job_locations
+        description: >
+          Contains job location information from ashby
+        tests:
+          - dbt_expectations.expect_table_columns_to_match_set:
+              column_list: ["id",
+                "levergreen_id", "created_at" , "updated_at", "company_name", "ashby_job_board_source",
+                  "run_hash", "raw_json_file_location", "existing_json_used", "opening_id", "secondary_location_id",
+                  "secondary_location_name"
+              ]
+        columns:
+          - name: id
+            description: serial id created by postgres upon insertion
+            tests:
+              - unique
+              - not_null
+          - name: levergreen_id
+            description: Id from Levergreen scraper. Unique relative to the HTML file used
+            tests:
+              - not_null
+          - name: created_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time. If we used existing HTML, this field is not updated.
+            tests:
+              - not_null
+          - name: updated_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time.
+            tests:
+              - not_null
+          - name: ashby_job_board_source
+            description: Ashby careers page source. 
+            tests:
+              - not_null
+          - name: company_name
+            description: Company name from ashby. Taken by grabbing the end of the source.
+            tests:
+              - not_null
+          - name: run_hash
+            description: Hashed value using hash ids to identify the id of a particular scraped
+          - name: existing_json_used
+            description: Boolean whether or not this was a fresh request
+          - name: raw_json_file_location
+            description: S3 bucket where the scraped data is stored
+          - name: secondary_location_id
+            description: Secondary Location ids from ashby
+          - name: secondary_location_name
+            description: Name of the secondary location
+          - name: opening_id
+            description: foreign key to tie the secondary location to
+
+      - name: ashby_jobs_outline
+        description: >
+          Contains job outline information from ashby
+        tests:
+          - dbt_expectations.expect_table_columns_to_match_set:
+              column_list: ["id",
+                "levergreen_id", "created_at" , "updated_at", "company_name", "ashby_job_board_source",
+                  "run_hash", "raw_json_file_location", "existing_json_used", "opening_id", "opening_name",
+                  "department_id", "location_id", "location_name", "employment_type", "compensation_tier",
+                  "opening_link"
+                  ]
+        columns:
+          - name: id
+            description: serial id created by postgres upon insertion
+            tests:
+              - unique
+              - not_null
+          - name: levergreen_id
+            description: Id from Levergreen scraper. Unique relative to the HTML file used
+            tests:
+              - not_null
+          - name: created_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time. If we used existing HTML, this field is not updated.
+            tests:
+              - not_null
+          - name: updated_at
+            description: Timestamp of when the ashby site was scraped, in UNIX time.
+            tests:
+              - not_null
+          - name: ashby_job_board_source
+            description: Ashby careers page source. 
+            tests:
+              - not_null
+          - name: company_name
+            description: Company name from ashby. Taken by grabbing the end of the source.
+            tests:
+              - not_null
+          - name: run_hash
+            description: Hashed value using hash ids to identify the id of a particular scraped
+          - name: existing_json_used
+            description: Boolean whether or not this was a fresh request
+          - name: raw_json_file_location
+            description: S3 bucket where the scraped data is stored
+          - name: opening_id
+            description: Job opening id from ashby
+            tests:
+              - not_null
+          - name: opening_name
+            description: Name of the job opening
+            tests:
+              - not_null
+          - name: department_id 
+            description: department_id of the job opening. Most specific opening is included (furthest level down).
+          - name: location_id
+            description: Primary location id
+            tests:
+              - not_null
+          - name: location_name
+            description: Name of the primary location
+            tests:
+              - not_null
+          - name: employment_type
+            description: Type of employment
+            tests:
+              - accepted_values:
+                  values: ["Intern", "Contract", "PartTime", "FullTime", "Temporary"]
+          - name: compensation_tier
+            description: Salary Range as text
+          - name: opening_link
+            description: Link of the job posting
+