---
# Welcome to your prefect.yaml file! You can use this file for storing and managing
# configuration for deploying your flows. We recommend committing this file to source
# control along with your flow code.
# Generic metadata about this project
name: prosnet-prefect-pipelines
prefect-version: 2.8.3+1589.g1dd04fcae
# build section allows you to manage and build docker images
build:
# push section allows you to manage if and how this project is uploaded to remote locations
push:
# pull section allows you to provide instructions for cloning this project in remote locations
pull:
- prefect.deployments.steps.run_shell_script:
id: install-git-lfs
script: |
apt-get update
apt-get install -y git-lfs
git lfs install
- prefect.deployments.steps.git_clone:
id: clone-step
repository: https://github.com/acdh-oeaw/prosnet-prefect-pipelines.git
branch: main
access_token:
- prefect.deployments.steps.pip_install_requirements:
id: install-deps-project
directory: '{{ clone-step.directory }}'
requirements_file: requirements.txt
stream_output: false
# the deployments section allows you to provide configuration for deploying flows
deployments:
- name: default
version:
tags: []
description: Create a Typesense index from SPARQL data.
entrypoint:
prosnet-prefect-pipelines/wikidata_index.py:create_typesense_index_from_sparql_query
parameters: {}
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedule:
is_schedule_active: true
- name: prosnet_index_wikidata_persons
version: 0.1.1
tags:
- wikidata
- typesense
- prosnet
- persons
description: Create a Person index from Wikidata. Runs incremental updates every
day at 2:00 UTC. Currently the index is limited to persons born in a predecessor
of Austria.
entrypoint:
prosnet-prefect-pipelines/wikidata_index.py:create_typesense_index_from_sparql_query
parameters:
params:
limit: 200
typesense_definition:
name: prosnet-wikidata-person-index
fields:
- name: id
type: string
- name: description
type: string
optional: true
- name: label
type: string
- name: name
type: string
optional: true
- name: date_of_birth
type: string
optional: true
- name: date_of_death
type: string
optional: true
- name: place_of_birth
type: string
optional: true
- name: place_of_death
type: string
optional: true
incremental_date: 2
typesense_collection_name: prosnet-wikidata-person-index
path_sparql_query: prosnet-prefect-pipelines/sparql/wikidata-person.sparql
field_mapping:
itemLabel: name
place_of_birthLabel: place_of_birth
place_of_deathLabel: place_of_death
data_postprocessing_functions:
date_of_birth: date_postprocessing
date_of_death: date_postprocessing
label_creator_function: label_creator_person
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedule:
cron: "0 2 * * *"
timezone: UTC
day_or: true
active: true
- name: prosnet_index_geonames_places
version: 0.1.1
tags:
- geonames
- typesense
- prosnet
- places
description: Create a Place index from Geonames. Uses the download page of Geonames
to get TSV data and create the index. Currently places with more than 1000 inhabitants
are indexed. Currently runs every Monday at 3:00 UTC.
entrypoint:
prosnet-prefect-pipelines/geonames_place_index.py:create_typesense_place_index_from_geonames
parameters:
params:
tsv_location: https://download.geonames.org/export/dump/cities1000.zip
schedule:
cron: "0 3 * * 1"
timezone: UTC
day_or: true
active: true
work_pool:
name: intavia
work_queue_name:
job_variables: {}
- name: prosnet_index_wikidata_places
version: 0.1.1
tags:
- wikidata
- typesense
- prosnet
- places
description: Create a Place index from Wikidata. Runs incremental updates every
day at 4:00 UTC. Currently the index is limited to places that are cities or higher.
entrypoint:
prosnet-prefect-pipelines/wikidata_index.py:create_typesense_index_from_sparql_query
parameters:
params:
limit: 200
typesense_definition:
name: prosnet-wikidata-place-index
fields:
- name: id
type: string
- name: label
type: string
- name: name
type: string
optional: true
- name: country
type: string
optional: true
- name: feature_code
type: string
optional: true
- name: coordinates
type: geopoint
optional: true
incremental_date: 2
typesense_collection_name: prosnet-wikidata-place-index
path_sparql_query: prosnet-prefect-pipelines/sparql/wikidata-city.sparql
field_mapping:
itemLabel: name
countryLabel: country
coord: coordinates
data_postprocessing_functions:
coordinates: geopoint_creator
feature_code: feature_code_postprocessing
label_creator_function: label_creator_place
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedule:
cron: "0 4 * * *"
timezone: UTC
day_or: true
active: true
- name: prosnet_index_wikidata_organizations
version: 0.1.6
tags:
- wikidata
- typesense
- prosnet
- organizations
description: Create an Organization index from Wikidata. Runs incremental updates
every day at 1:00 UTC. Currently the index is limited to organizations located
in a predecessor of Austria.
entrypoint:
prosnet-prefect-pipelines/wikidata_index.py:create_typesense_index_from_sparql_query
parameters:
params:
limit: 100
typesense_definition:
name: prosnet-wikidata-organization-index
fields:
- name: id
type: string
- name: description
type: string
optional: true
- name: label
type: string
- name: name
type: string
optional: true
- name: inception
type: string
optional: true
- name: dissolvement
type: string
optional: true
incremental_date: 2
typesense_collection_name: prosnet-wikidata-organization-index
path_sparql_query: prosnet-prefect-pipelines/sparql/wikidata-organization.sparql
field_mapping:
organizationLabel: name
data_postprocessing_functions:
inception: cocatenated_dates_postprocessing
dissolvement: cocatenated_dates_postprocessing
label_creator_function: label_creator_organization
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedule:
cron: "0 1 * * *"
timezone: UTC
day_or: true
active: true
- name: push-to-pfp-source-data-repo
version:
tags: []
description:
entrypoint: pfp-prefect-pipelines/push_rdf_file_to_github_gitlab.py:push_data_to_repo_flow
parameters: {}
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedules: []
- name: default
version:
tags: []
description: Flow that fetches data from APIs and pushes it to a GitLab repository.
entrypoint:
pfp-prefect-pipelines/get_apis_data_and_push_to_repo.py:get_apis_data_and_push_to_gitlab
parameters: {}
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedules: []
- name: pio_data_update
version:
tags:
- rdf
- qlever
description: Flow that fetches data from APIs and pushes it to a GitLab repository.
entrypoint:
pfp-prefect-pipelines/get_apis_data_and_push_to_repo.py:get_apis_data_and_push_to_gitlab
parameters:
params:
get_params:
accept_header: "text/ttl"
secret_token: "oebl-pfp-api-token"
limit: 200
api_url: "https://oebl-pfp.acdh-ch-dev.oeaw.ac.at/apis/api/apis_ontology."
swagger_url: "https://oebl-pfp.acdh-ch-dev.oeaw.ac.at/apis/swagger/schema/"
swagger_tags:
- "rdfexport"
output_path: "pio.ttl"
push_params:
repo: "acdh-ch/pfp/pfp-source-data"
username_secret: "gitlab-source-data-username"
password_secret: "gitlab-source-data-password"
git_provider: "oeaw-gitlab"
branch_name: "pio_branch_4"
file_path: "pio.ttl"
file_path_git: "datasets/pio.ttl"
work_pool:
name: intavia
work_queue_name:
job_variables: {}
schedules: []