forked from ropensci/patentsview
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request ropensci#36 from mustberuss/master
Updating fieldsdf
- Loading branch information
Showing
3 changed files
with
525 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,71 +1,140 @@ | ||
library(tidyverse) | ||
library(devtools) | ||
library(rapiclient) | ||
|
||
load_all() | ||
|
||
# Temp swagger API spec | ||
# TODO(any): Update with actual PatentsView version after its issues are sorted | ||
api <- get_api(url = "https://patentsview.historicip.com/swagger/openapi.json") | ||
|
||
# Keep only the plain endpoints; paths containing "{" take URL parameters.
endpoint_paths <- names(api$paths)
endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)]

# Endpoint name: the last path component, ignoring an optional trailing slash.
endpoints <- gsub(".*/(\\w+)(/$)?", "\\1", endpoint_paths)

# Entity name per endpoint, taken from the schema referenced by the endpoint's
# 200 response with the "SuccessResponse" suffix stripped.
# vapply() guarantees a character vector (named by path) with exactly one
# entity per endpoint and fails loudly when a path has no such $ref; sapply()
# would silently hand back a list in that case.
entities <- vapply(
  endpoint_paths,
  function(path) {
    success_response <-
      api$paths[[path]]$get$responses$`200`$content$`application/json`$schema$`$ref`
    gsub(".*/(\\w+)SuccessResponse", "\\1", success_response)
  },
  character(1)
)
|
||
# Extract the schema name referenced by a property's array items.
#
# `list` is one property definition from the OpenAPI spec. When it has an
# "items" entry carrying a "$ref", everything up to the last "/" of that
# reference is stripped and the remaining schema name is returned.
#
# Returns NA when there is no "items" entry, and also when the items carry no
# "$ref" (e.g. an array of primitives). The latter used to yield character(0),
# which cannot be stored in the one-row data.frame the caller builds.
try_get_ref <- function(list) {
  if (!"items" %in% names(list)) {
    return(NA)
  }

  ref <- list[["items"]][["$ref"]]
  if (is.null(ref)) {
    return(NA)
  }

  gsub(".*/", "", ref)
}
|
||
# Build a table of fields for the given schema elements.
#
# For each name in `schema_elements`, the schema's properties are read from
# the global `api` spec and turned into one row per property with the columns
# `field` (taken from the row names of the stacked properties), `data_type`
# (the property's declared type), `ref` (result of try_get_ref()) and
# `schema_element`. The per-schema tables are then stacked into one data frame.
extract_relevant_schema_info <- function(schema_elements) {
  describe_schema <- function(element_name) {
    property_rows <- lapply(
      api$components$schemas[[element_name]]$properties,
      function(property) {
        data.frame(type = property$type, ref = try_get_ref(property))
      }
    )

    schema_table <- do.call(rbind, property_rows)
    schema_table <- rownames_to_column(schema_table)
    schema_table <- setNames(schema_table, c("field", "data_type", "ref"))
    mutate(schema_table, schema_element = element_name)
  }

  do.call(rbind, lapply(schema_elements, describe_schema))
}
|
||
# Fields that sit directly on each endpoint's entity schema
nonnested_elements <- extract_relevant_schema_info(entities)

# Fields of the nested schemas: every schema named "*Nested", plus YearlyPatents
schema_element_names <- names(api$components$schemas)
nested_elements <- grep("Nested$", schema_element_names, value = TRUE)
nested_elements <- extract_relevant_schema_info(c("YearlyPatents", nested_elements))

# Entity name -> pluralised endpoint name
lookup <- setNames(sapply(endpoints, to_plural), entities)

# Rows without a `ref` describe a field of the entity itself; rows with a `ref`
# are expanded into the fields of the referenced nested schema and are named
# "<group>.<common_name>". Any common_name ending in "_date" is typed as "date".
fieldsdf <-
  left_join(
    nonnested_elements,
    nested_elements,
    by = c("ref" = "schema_element")
  ) %>%
  mutate(
    endpoint = lookup[schema_element],
    common_name = ifelse(is.na(ref), field.x, field.y),
    group = ifelse(is.na(ref), lookup[schema_element], field.x),
    data_type = ifelse(
      grepl("_date$", common_name),
      "date",
      ifelse(is.na(ref), data_type.x, data_type.y)
    ),
    field = ifelse(is.na(ref), common_name, paste0(group, ".", common_name))
  ) %>%
  select(endpoint, field, data_type, group, common_name)

write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, internal = FALSE, overwrite = TRUE)
use_data(fieldsdf, internal = TRUE, overwrite = TRUE)
library(tidyverse) | ||
library(devtools) | ||
library(rapiclient) | ||
|
||
load_all() | ||
|
||
# TODO(any): remove corrections when bugs are fixed | ||
|
||
# Hard coded data type overrides, one row per (endpoint, field) pair.
corrections <- data.frame(
  endpoint = c(
    "assignee",
    "patent",
    "patent/us_application_citation",
    "publication",
    "publication"
  ),
  field = c(
    "assignee_type",
    "assignees.assignee_type",
    "citation_document_number",
    "assignees.assignee_type",
    "rule_47_flag"
  ),
  data_type = c("int", "int", "int", "int", "bool"),
  stringsAsFactors = FALSE
)
|
||
api <- get_api(url = "https://search.patentsview.org/static/openapi.json")

endpoint_paths <- names(api$paths)

# get rid of url parameter paths
endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)]

# now we need to keep the parent portion of the nested patent/ and
# publication/ endpoints
endpoints <- sub("/api/v1/((patent/|publication/)?\\w+)/$", "\\1", endpoint_paths)

# Name of the "...SuccessResponse" schema referenced by each endpoint's 200
# response. vapply() guarantees a character vector (named by path) with
# exactly one schema name per endpoint and fails loudly when a path has no
# such $ref; sapply() would silently hand back a list in that case.
entities <- vapply(
  endpoint_paths,
  function(path) {
    success_response <-
      api$paths[[path]]$get$responses$`200`$content$`application/json`$schema$`$ref`
    gsub(".*/(\\w+SuccessResponse)", "\\1", success_response)
  },
  character(1)
)

# Response schema name -> endpoint name
lookup <- endpoints
names(lookup) <- entities
|
||
# detect "type":"string", "format":"date" (which is normal) | ||
# Not sure if the other checks are standard but they're used in the patentsview object | ||
|
||
# Work out the data type to record for one property definition of the spec.
#
# field_definition: list with a `type` entry and, optionally, `format` and
#   `example` entries.
#
# Returns a single string, decided in this order:
#   * "integer", "boolean" and "array" types are returned unchanged
#   * "number" is reported as "integer"
#   * any remaining type whose format is "date" is reported as "date"
#   * a "string" whose example is "double" is reported as "number"
#   * everything else is returned unchanged
data_type_intuit <- function(field_definition) {
  present <- names(field_definition)
  declared_type <- field_definition$type
  declared_format <- if ("format" %in% present) field_definition$format else ""
  sample_value <- if ("example" %in% present) field_definition$example else ""

  if (declared_type %in% c("integer", "boolean", "array")) {
    return(declared_type)
  }
  if (declared_type == "number") {
    return("integer")
  }
  if (declared_format == "date") {
    return("date")
  }
  if (declared_type == "string" && sample_value == "double") {
    return("number")
  }

  declared_type
}
|
||
|
||
# recurse if type is array? | ||
|
||
# Build the fieldsdf rows for the given response schemas.
#
# For every name in `schema_elements`, the item properties of the schema's
# first property are read from the global `api` spec. A non-array property
# becomes one row with an empty group. An array property contributes one row
# per property of its items, named "<group>.<name>" and grouped under the
# array's name (nesting is only expected to be one level deep). The endpoint
# of each row comes from the global `lookup`. All rows are stacked and sorted
# by endpoint and field so that changes to the csv file are easy to spot.
extract_relevant_schema_info <- function(schema_elements) {
  per_schema <- lapply(schema_elements, function(schema_element) {
    endpoint_props <-
      api$components$schemas[[schema_element]]$properties[[1]]$items$properties

    top_level_rows <- lapply(names(endpoint_props), function(prop_name) {
      prop_def <- endpoint_props[[prop_name]]
      prop_type <- data_type_intuit(prop_def)

      if (prop_type != "array") {
        return(
          data.frame(
            endpoint = lookup[[schema_element]],
            field = prop_name,
            data_type = prop_type,
            group = "",
            common_name = prop_name
          )
        )
      }

      # Array property: one row per nested field, grouped under the array name
      child_props <- prop_def$items$properties
      child_rows <- lapply(names(child_props), function(child_name) {
        data.frame(
          endpoint = lookup[[schema_element]],
          field = paste0(prop_name, ".", child_name),
          data_type = data_type_intuit(child_props[[child_name]]),
          group = prop_name,
          common_name = child_name
        )
      })

      do.call(rbind, child_rows)
    })

    do.call(rbind, top_level_rows)
  })

  # sort so we can tell if the csv file changed
  arrange(do.call(rbind, per_schema), endpoint, field)
}
|
||
fieldsdf <- extract_relevant_schema_info(entities)

# TODO(any): remove hard coding corrections when possible

# Two sets of corrections are needed. The first one overrides data types with
# the hard coded values in `corrections` (reported as bugs); rows without a
# matching correction keep the type derived from the spec.
# NOTE(review): `corrections` spells its types "int"/"bool" while
# data_type_intuit() emits "integer"/"boolean" -- confirm that consumers of
# fieldsdf accept both spellings.
fieldsdf <- left_join(fieldsdf, corrections, by = c("endpoint", "field"))
fieldsdf$data_type <- coalesce(fieldsdf$data_type.y, fieldsdf$data_type.x)
fieldsdf <- select(fieldsdf, -data_type.x, -data_type.y)
fieldsdf <- relocate(fieldsdf, data_type, .after = field)
|
||
# The second set of corrections appends "_id" to the fields and common_names below.
# The API team may not consider this to be a bug. The OpenAPI object describes the
# API's return, not the requests we make (requests with the _id are returned without them)
# "patent","assignees.assignee","string","assignees","assignee" | ||
# "patent","inventors.inventor","string","inventors","inventor" | ||
# "publication","assignees.assignee","string","assignees","assignee" | ||
# "publication","inventors.inventor","string","inventors","inventor" | ||
|
||
add_id_to <- c("assignees.assignee", "inventors.inventor")

# Flag the affected rows once, from the untouched field column, and then
# suffix both columns from that flag. Testing `field` again after it has been
# renamed would no longer match.
fieldsdf <- fieldsdf %>%
  mutate(needs_id = field %in% add_id_to) %>%
  mutate(
    across(
      c(common_name, field),
      function(value) if_else(needs_id, paste0(value, "_id"), value)
    )
  ) %>%
  select(-needs_id)

write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, internal = FALSE, overwrite = TRUE)
use_data(fieldsdf, internal = TRUE, overwrite = TRUE)
Oops, something went wrong.