Skip to content

Commit

Permalink
Merge pull request ropensci#36 from mustberuss/master
Browse files Browse the repository at this point in the history
Updating fieldsdf
  • Loading branch information
crew102 authored Nov 30, 2024
2 parents bf3d452 + e4e13e4 commit e1ca36b
Show file tree
Hide file tree
Showing 3 changed files with 525 additions and 202 deletions.
211 changes: 140 additions & 71 deletions data-raw/fieldsdf.R
Original file line number Diff line number Diff line change
@@ -1,71 +1,140 @@
library(tidyverse)
library(devtools)
library(rapiclient)

load_all()

# Temp swagger API spec
# TODO(any): Update with actual PatentsView version after its issues are sorted
api <- get_api(url = "https://patentsview.historicip.com/swagger/openapi.json")

endpoint_paths <- names(api$paths)
endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)]
endpoints <- gsub(".*/(\\w+)(/$)?", "\\1", endpoint_paths)
entities <-
sapply(endpoint_paths, function(y) {
success_response <- api$paths[y][1][[y]]$get$responses$`200`$content$`application/json`$schema$`$ref`
gsub(".*/(\\w+)SuccessResponse", "\\1", success_response)
})

try_get_ref <- function(list) {
if ("items" %in% names(list)) {
gsub(".*/", "", list[["items"]][["$ref"]])
} else {
NA
}
}

extract_relevant_schema_info <- function(schema_elements) {
out_list <- lapply(schema_elements, function(schema_element) {
lapply(
api$components$schemas[[schema_element]]$properties,
function(x) data.frame(
type = x$type,
ref = try_get_ref(x)
)
) %>%
do.call(rbind, .) %>%
rownames_to_column() %>%
setNames(c("field", "data_type", "ref")) %>%
mutate(schema_element = schema_element)
})
do.call(rbind, out_list)
}

nonnested_elements <- extract_relevant_schema_info(entities)

schema_element_names <- names(api$components$schemas)
nested_elements <- schema_element_names[grepl("Nested$", schema_element_names)]
nested_elements <- c("YearlyPatents", nested_elements)
nested_elements <- extract_relevant_schema_info(nested_elements)

lookup <- sapply(endpoints, to_plural)
names(lookup) <- entities

fieldsdf <-
nonnested_elements %>%
left_join(nested_elements, by = c("ref" = "schema_element")) %>%
mutate(
common_name = ifelse(is.na(ref), field.x, field.y),
data_type = ifelse(is.na(ref), data_type.x, data_type.y),
group = ifelse(is.na(ref), lookup[schema_element], field.x),
endpoint = lookup[schema_element],
field = ifelse(is.na(ref), common_name, paste0(group, ".", common_name))
) %>%
mutate(data_type = ifelse(grepl("_date$", common_name), "date", data_type)) %>%
select(endpoint, field, data_type, group, common_name)

write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, internal = FALSE, overwrite = TRUE)
use_data(fieldsdf, internal = TRUE, overwrite = TRUE)
library(tidyverse)
library(devtools)
library(rapiclient)

load_all()

# TODO(any): remove corrections when bugs are fixed

corrections <- read.table(
text = "endpoint field data_type
assignee assignee_type int
patent assignees.assignee_type int
patent/us_application_citation citation_document_number int
publication assignees.assignee_type int
publication rule_47_flag bool",
header = TRUE,
stringsAsFactors = FALSE
)

api <- get_api(url = "https://search.patentsview.org/static/openapi.json")

endpoint_paths <- names(api$paths)

# get rid of url parameter paths
endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)]

# now we need to keep the parent portion of the nested patent/ and publication/ endpoints
endpoints <- sub("/api/v1/((patent/|publication/)?\\w+)/$", "\\1", endpoint_paths)

entities <-
sapply(endpoint_paths, function(y) {
success_response <- api$paths[y][1][[y]]$get$responses$`200`$content$`application/json`$schema$`$ref`
gsub(".*/(\\w+SuccessResponse)", "\\1", success_response)
})

lookup <- endpoints
names(lookup) <- entities

# detect "type":"string", "format":"date" (which is normal)
# Not sure if the other checks are standard but they're used in the patentsview object

data_type_intuit <- function(field_definition) {
type <- field_definition$type
format <- if ("format" %in% names(field_definition)) field_definition$format else ""
example <- if ("example" %in% names(field_definition)) field_definition$example else ""
as_is_types <- c("integer", "boolean", "array")

if (type %in% as_is_types) {
type
} else if (type == "number") {
"integer"
} else if (format == "date") {
"date"
} else if (type == "string" && example == "double") {
"number"
} else {
type
}
}


# recurse if type is array?

extract_relevant_schema_info <- function(schema_elements) {
lapply(schema_elements, function(schema_element) {
middle <- lapply(
names(api$components$schemas[[schema_element]]$properties[[1]]$items$properties),
function(x, y) {
data_type <- data_type_intuit(y[[x]])

if (data_type == "array") {
group <- x

inner <- lapply(
names(y[[x]]$items$properties),
function(a, b) {
# only nested one deep- wouldn't be an array here
data.frame(
endpoint = lookup[[schema_element]],
field = paste0(group, ".", a),
data_type = data_type_intuit(b[[a]]),
group = group,
common_name = a
)
},
y[[x]]$items$properties
)

do.call(rbind, inner)
} else {
data.frame(
endpoint = lookup[[schema_element]],
field = x,
data_type = data_type,
group = "",
common_name = x
)
}
}, api$components$schemas[[schema_element]]$properties[[1]]$items$properties
)

do.call(rbind, middle)
}) %>%
do.call(rbind, .) %>%
arrange(endpoint, field) # sort so we can tell if the csv file changed
}

fieldsdf <- extract_relevant_schema_info(entities)

# TODO(any): remove hard coding corrections when possible

# We need to make two sets of corrections. First we make hard coded type corrections
# that we reported as bugs
fieldsdf <- fieldsdf %>%
left_join(corrections, by = c("endpoint", "field")) %>%
mutate(data_type = coalesce(data_type.y, data_type.x)) %>%
select(-data_type.x, -data_type.y) %>%
relocate(data_type, .after = field)

# The second set of corrections is to append "_id" to fields and common_names below.
# The API team may not concider this to be a bug. The OpenAPI object describes the
# API's return, not the requests we make (requests with the _id are returned without them)
# "patent","assignees.assignee","string","assignees","assignee"
# "patent","inventors.inventor","string","inventors","inventor"
# "publication","assignees.assignee","string","assignees","assignee"
# "publication","inventors.inventor","string","inventors","inventor"

add_id_to <- c("assignees.assignee", "inventors.inventor")

# change common_name first, condition isn't met if field is changed first DAMHIKT
fieldsdf <- fieldsdf %>%
mutate(
common_name = if_else(field %in% add_id_to, paste0(common_name, "_id"), common_name),
field = if_else(field %in% add_id_to, paste0(field, "_id"), field)
)

write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, internal = FALSE, overwrite = TRUE)
use_data(fieldsdf, internal = TRUE, overwrite = TRUE)
Loading

0 comments on commit e1ca36b

Please sign in to comment.