Commit

Rewrite metadata parsing for more consistent and pythonic outputs
yannforget committed Mar 1, 2021
1 parent cc3ae96 commit da151b6
Showing 4 changed files with 204 additions and 59 deletions.
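For context, a rough usage sketch of what this rewrite changes for callers. The scene and dataset IDs come from the tests touched below, but the commented outputs are illustrative only, not copied from a real API response:

from landsatxplore.api import API

# Credentials are placeholders.
api = API("your_username", "your_password")
meta = api.metadata(entity_id="LT51730582011301MLK00", dataset="landsat_tm_c2_l1")

# Before this commit, keys mirrored the raw API response, e.g.
#   meta["displayId"], meta["cloudCover"] (values kept as returned, often strings).
# After this commit, keys are snake_case and values are parsed, e.g.
#   meta["display_id"]        -> "LT05_L1TP_173058_20111028_20200820_02_T1"
#   meta["cloud_cover"]       -> a float
#   meta["acquisition_date"]  -> a datetime.datetime
#   meta["spatial_coverage"]  -> a shapely geometry
# Pass browse=True to also include parsed LandsatLook browse entries.

api.logout()
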
195 changes: 155 additions & 40 deletions landsatxplore/api.py
@@ -4,23 +4,19 @@
from urllib.parse import urljoin
import string
import random
from datetime import datetime
from dateutil import parser
import time

import requests
from shapely.geometry import Point
from shapely.geometry import Point, shape

from landsatxplore.errors import USGSAuthenticationError, USGSError, USGSRateLimitError


API_URL = "https://m2m.cr.usgs.gov/api/api/json/stable/"


def _random_string(length=10):
"""Generate a random string."""
letters = string.ascii_lowercase
return "".join(random.choice(letters) for i in range(length))


class API(object):
"""EarthExplorer API."""

@@ -161,36 +157,7 @@ def get_entity_id(self, display_id, dataset):
else:
return entity_id

@staticmethod
def parse_metadata(response):
"""Parse metadata from API response.
Parameters
----------
response : requests response
As returned by api.request("scene-metadata").
Returns
-------
scene_metadata : dict
Metadata parsed into a dict.
"""
scene_metadata = {}
for key, value in response.items():
if key in ("browse", "metadata"):
continue
else:
scene_metadata[key] = value

for field in response["metadata"]:
label = field["dictionaryLink"].split("#")[-1].strip()
if label in ("coordinate_degrees", "coordinate_decimal"):
continue
scene_metadata[label] = field["value"]

return scene_metadata

def metadata(self, entity_id, dataset):
def metadata(self, entity_id, dataset, browse=False):
"""Get metadata for a given scene.
Parameters
@@ -199,6 +166,8 @@ def metadata(self, entity_id, dataset):
Landsat Scene ID or Sentinel Entity ID.
dataset : str
Dataset alias.
browse : bool, optional
Include browse (LandsatLook URLs) metadata items.
Returns
-------
@@ -213,7 +182,7 @@ def metadata(self, entity_id, dataset):
"metadataType": "full",
},
)
return self.parse_metadata(r)
return _parse_metadata(r, parse_browse_field=browse)

def get_display_id(self, entity_id, dataset):
"""Get display ID from entity ID.
@@ -231,7 +200,7 @@ def get_display_id(self, entity_id, dataset):
Landsat Product ID or Sentinel Display ID.
"""
meta = self.metadata(entity_id, dataset)
return meta["displayId"]
return meta["display_id"]

def search(
self,
@@ -302,7 +271,153 @@ def search(
"metadataType": "full",
},
)
return [self.parse_metadata(scene) for scene in r.get("results")]
return [_parse_metadata(scene) for scene in r.get("results")]


def _random_string(length=10):
"""Generate a random string."""
letters = string.ascii_lowercase
return "".join(random.choice(letters) for i in range(length))


def _title_to_snake(src_string):
"""Convert title case to snake_case."""
return src_string.lower().replace(" ", "_").replace("/", "-")


def _camel_to_snake(src_string):
"""Convert camelCase string to snake_case."""
dst_string = [src_string[0].lower()]
for c in src_string[1:]:
if c in ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
dst_string.append("_")
dst_string.append(c.lower())
else:
dst_string.append(c)
return "".join(dst_string)


def _to_num(src_string):
"""Convert string to int or float if possible.
Original value is returned if conversion failed.
"""
if not isinstance(src_string, str):
return src_string
src_string = src_string.strip()
try:
return int(src_string)
except ValueError:
try:
return float(src_string)
except ValueError:
return src_string


def _to_date(src_string):
"""Convert string to datetime if possible.
Original value is returned if conversion failed.
"""
if not isinstance(src_string, str):
return src_string
try:
return parser.parse(src_string)
except parser.ParserError:
try:
# Specific date format for start_time and end_time
nofrag, frag = src_string.split(".")
dtime = datetime.strptime(nofrag, "%Y:%j:%H:%M:%S")
dtime = dtime.replace(microsecond=int(frag[:6]))
return dtime
except ValueError:
pass
return src_string


def _parse_value(src_value):
"""Try to convert value to numeric or date if possible.
Original value is returned if conversion failed.
"""
dst_value = src_value
if isinstance(dst_value, str):
dst_value = dst_value.strip()
dst_value = _to_num(dst_value)
dst_value = _to_date(dst_value)
return dst_value


def _parse_browse_metadata(src_meta):
"""Parse the browse field returned by the API."""
dst_meta = {}
for product in src_meta:
name = _title_to_snake(product["browseName"])
dst_meta[name] = {}
for field, value in product.items():
dst_meta[name][_camel_to_snake(field)] = value
return dst_meta


def _parse_metadata_field(src_meta):
"""Parse the metadata field returned by the API."""
dst_meta = {}
for meta in src_meta:
# Convert field name to snake case
name = _title_to_snake(meta["fieldName"])
# Abbreviate "identifier" by "id" for shorter names
name = name.replace("identifier", "id")
# Always use "acquisition_date" instead of "acquired_date" for consistency
if name == "date_acquired":
name = "acquisition_date"
# Remove processing-level information in field names for consistency
name = name.replace("_l1", "")
name = name.replace("_l2", "")
# Dictionary link URL also provides some information on the field
dict_id = meta.get("dictionaryLink").split("#")[-1].strip()
# Do not process this field
if dict_id == "coordinates_degrees":
continue
# Sentinel metadata has an "Entity ID" field that would
# conflict with the API entityId field
if name == "entity_id":
name = "sentinel_entity_id"
# Do not parse numeric IDs. Keep them as strings.
if name.endswith("_id"):
dst_meta[name] = str(meta.get("value")).strip()
else:
dst_meta[name] = _parse_value(meta.get("value"))
return dst_meta


def _parse_metadata(response, parse_browse_field=False):
"""Parse the full response returned by the API when requesting metadata."""
metadata = {}
for key, value in response.items():
name = _camel_to_snake(key)
if key == "browse":
if parse_browse_field:
metadata[name] = _parse_browse_metadata(value)
else:
continue
elif key == "spatialCoverage":
metadata[name] = shape(value)
elif key == "spatialBounds":
metadata[name] = shape(value).bounds
elif key == "temporalCoverage":
start, end = value["endDate"], value["startDate"]
metadata[name] = [_to_date(start), _to_date(end)]
elif key == "metadata":
metadata.update(_parse_metadata_field(value))
else:
# Do not parse numeric IDs. Keep them as strings.
if name.endswith("_id"):
metadata[name] = str(value).strip()
else:
metadata[name] = _parse_value(value)
if "acquisition_date" not in metadata:
metadata["acquisition_date"] = metadata["temporal_coverage"][0]
return metadata


class Coordinate(dict):
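As a quick illustration of how the new private parsing helpers added above behave (these are internal to landsatxplore.api and may change; the expected outputs in the comments are approximate):

from landsatxplore.api import _to_num, _to_date, _parse_value

_to_num("12.00")        # -> 12.0 (float)
_to_num("2")            # -> 2 (int)
_to_num("T1")           # -> "T1" (returned unchanged)
_to_date("2011-10-28")  # -> datetime(2011, 10, 28, 0, 0), parsed by dateutil
# Landsat start_time/end_time use a year:day-of-year format that dateutil is
# expected to reject, so _to_date falls back to the "%Y:%j:%H:%M:%S" branch:
_to_date("2011:301:08:29:51.0000000")  # -> expected datetime(2011, 10, 28, 8, 29, 51)
_to_date("not a date")  # -> "not a date" (returned unchanged)
_parse_value(" 42 ")    # -> 42 (stripped, then converted to int)
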
4 changes: 2 additions & 2 deletions landsatxplore/cli.py
@@ -106,11 +106,11 @@ def search(

if output == "entity_id":
for scene in results:
click.echo(scene["entityId"])
click.echo(scene["entity_id"])

if output == "display_id":
for scene in results:
click.echo(scene["displayId"])
click.echo(scene["display_id"])

if output == "json":
dump = json.dumps(results, indent=True)
17 changes: 17 additions & 0 deletions landsatxplore/util.py
@@ -122,3 +122,20 @@ def guess_dataset(identifier):
return "sentinel_2a"
else:
raise LandsatxploreError("Failed to guess dataset from identifier.")


def title_to_snake(src_string):
"""Convert title string to snake_case."""
return src_string.lower().replace(" ", "_").replace("/", "-")


def camel_to_snake(src_string):
"""Convert camelCase string to snake_case."""
dst_string = [src_string[0].lower()]
for c in src_string[1:]:
if c in ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
dst_string.append("_")
dst_string.append(c.lower())
else:
dst_string.append(c)
return "".join(dst_string)
47 changes: 30 additions & 17 deletions tests/test_api.py
@@ -2,8 +2,9 @@

import pytest
import os
from datetime import datetime
from shapely.geometry import Polygon
from landsatxplore import api, errors
from landsatxplore import api, errors, util


BRUSSELS_AREA = Polygon(
@@ -99,19 +100,31 @@ def test_api_get_scene_id(ee_api):

def test_api_metadata(ee_api):

# Collection 1
SCENE_ID = "LT51730582011301MLK00"
DATASET = "landsat_tm_c1"
metadata = ee_api.metadata(SCENE_ID, DATASET)
assert metadata["entityId"] == SCENE_ID
assert metadata["landsat_scene_id"] == SCENE_ID
PRODUCTS = [
"LT05_L1GS_173058_20111028_20161005_01_T2",
"LE07_L1TP_173058_20200926_20201022_01_T1",
"LC08_L1TP_173058_20201004_20201015_01_T1",
"LT05_L1TP_173058_20111028_20200820_02_T1",
"LT05_L2SP_173058_20111028_20200820_02_T1",
"LE07_L1TP_173058_20200926_20201022_02_T1",
"LE07_L2SP_173058_20200926_20201022_02_T1",
"LC08_L1TP_173058_20201004_20201015_02_T1",
"LC08_L2SP_173058_20201004_20201016_02_T1",
"L1C_T30QXG_A027990_20201031T103908",
]

# Collection 2
SCENE_ID = "LT51730582011301MLK00"
DATASET = "landsat_tm_c2_l1"
metadata = ee_api.metadata(SCENE_ID, DATASET)
assert metadata["entityId"] == SCENE_ID
assert metadata["collection_number"] == 2
for display_id in PRODUCTS:
dataset = util.guess_dataset(display_id)
entity_id = ee_api.get_entity_id(display_id, dataset)
metadata = ee_api.metadata(entity_id, dataset)
assert isinstance(metadata["cloud_cover"], float)
assert isinstance(metadata["acquisition_date"], datetime)
if dataset.startswith("landsat"):
assert util._is_landsat_product_id(metadata["landsat_product_id"])
assert util._is_landsat_scene_id(metadata["landsat_scene_id"])
elif dataset.startswith("sentinel"):
assert util._is_sentinel_display_id(metadata["display_id"])
assert util._is_sentinel_entity_id(metadata["entity_id"])


def test_api_get_product_id(ee_api):
@@ -139,7 +152,7 @@ def test_api_search(ee_api):
max_results=5,
)
assert len(scenes) >= 1
assert "cloudCover" in scenes[0]
assert "cloud_cover" in scenes[0]

# Bounding box
scenes = ee_api.search(
Expand All @@ -150,7 +163,7 @@ def test_api_search(ee_api):
max_results=5,
)
assert len(scenes) >= 1
assert "cloudCover" in scenes[0]
assert "cloud_cover" in scenes[0]

# Collection 2
scenes = ee_api.search(
@@ -162,5 +175,5 @@
max_results=10,
)
assert len(scenes) >= 1
assert "cloudCover" in scenes[0]
assert scenes[0]["displayId"][5:7] == "L2"
assert "cloud_cover" in scenes[0]
assert scenes[0]["display_id"][5:7] == "L2"
