Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft] Vocabularise UCDs #31

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
# this is defined in Vocabularies in the VO 2
KNOWN_PREDICATES = frozenset([
"ivoasem:preliminary", "ivoasem:deprecated", "ivoasem:useInstead",
"ivoasem:UCDSyntaxCode",
"rdfs:subClassOf",
"rdfs:subPropertyOf",
"skos:broader", "skos:exactMatch",
Expand All @@ -62,7 +63,7 @@
FULL_TERM_PATTERN = "[\w\d#:/_.*%-]+"

# an RE our terms themselves must match
TERM_PATTERN = "[\w\d_-]+"
TERM_PATTERN = "[\w\d_.-]+"

IVOA_RDF_URI = "http://www.ivoa.net/rdf/"

Expand Down Expand Up @@ -650,10 +651,16 @@ def _parse_relations(self, relations):
"""
for predicate, obj in self._iter_relationship_literals(relations):
# a little hack: URI-fy plain objects by making them part of
# the current vocabulary
# the current vocabulary; use a backquote (*almost* LISP,
# but we don't want to confuse CSV format sniffers) to
# suppress that; of course, we need to hex away that backquote
# again.
if obj and re.match(TERM_PATTERN+"$", obj):
obj = "#"+obj

if obj.startswith("`"):
obj = obj[1:]

self._add_relation(predicate, obj)

def get_objects_for(self, predicate):
Expand Down Expand Up @@ -721,7 +728,9 @@ def _format_more_relations(self):
("ivoasem:deprecated", "Deprecated Term"),
("skos:exactMatch", "Same As"),
("skos:related", "Related"),
("built-in:narrower", "Narrower")]:
("built-in:narrower", "Narrower"),
("ivoasem:UCDSyntaxCode", "UCD Syntax"),
]:

if prop=="built-in:narrower":
objs = [self._format_term_as_html(t)
Expand Down
8 changes: 8 additions & 0 deletions ucd/migration/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
The UCD vocabulary was migrated in 202? from the then-current ASCII
UCD list, published back then in an endorsed note, <TODO>.

This is the tooling that did the migration.  The command line is

python convert_list_to_csv.py > ../terms.csv

You do not want to repeat this after the initial migration.
30 changes: 30 additions & 0 deletions ucd/migration/convert_list_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Read ucd-list.txt, as distributed with the UCDList EN, and make a CSV for
the vocabulary tooling out of it.

This is based on the parsing script in the EN source.
"""


def convert_line(line):
    """Split one line of the UCD list into its fields.

    Returns a list of the three ``|``-separated fields of ``line``, each
    with surrounding whitespace removed, or None if ``line`` does not
    consist of exactly three such fields.
    """
    pieces = line.split('|')
    if len(pieces) == 3:
        return [s.strip() for s in pieces]
    # if it didn't parse, it was not in the normative EN list either,
    # and so callers are expected to skip it, too.
    return None


def ucd_to_csv(in_file):
    """Print a semicolon-separated CSV line for each UCD in ``in_file``.

    ``in_file`` is any iterable of text lines of the form
    ``syntax code | word | description``.  Lines starting with ``#`` are
    comments and are ignored; lines that convert_line cannot parse
    (including blank lines) are skipped silently.

    Each output line has the form::

        word;1;word;"description";ivoasem:UCDSyntaxCode(`syncode)

    The backquote in front of the syntax code is there so the vocabulary
    tooling takes it literally rather than as a term of the vocabulary
    (NOTE(review): based on the matching hack in convert.py's
    _parse_relations -- confirm).

    Raises ValueError if a description contains a double quote, as that
    would break the quoting of the description column.
    """
    for cur_line in in_file:
        if cur_line.startswith("#"):
            continue

        parsed = convert_line(cur_line)
        if parsed is None:
            # not a UCD definition (blank line, stray text): skip it rather
            # than crash when unpacking.
            continue
        syncode, word, description = parsed

        # An explicit check rather than an assert, which would vanish
        # when running under python -O.
        if '"' in description:
            raise ValueError(
                f"Description of {word} contains a double quote,"
                " which cannot be represented in the CSV output")

        print(f"{word};1;{word};\"{description}\";"
            f"ivoasem:UCDSyntaxCode(`{syncode})")

if __name__ == '__main__':
    # ucd-list.txt is looked up in the current working directory, so run
    # this from the directory that contains it.  The encoding is given
    # explicitly so the result does not depend on the locale; UTF-8 is a
    # superset of the ASCII the list is written in.
    with open("ucd-list.txt", 'r', encoding="utf-8") as f:
        ucd_to_csv(f)
Loading