Skip to content

Commit

Permalink
Small workaround to deal with HTML comments in JSON-LD
Browse files Browse the repository at this point in the history
  • Loading branch information
huberrob committed Oct 16, 2023
1 parent 6a10273 commit f167b51
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions fuji_server/harvester/metadata_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,12 +615,19 @@ def retrieve_metadata_embedded_extruct(self):
except Exception:
extruct_target = self.landing_html
pass

try:
self.logger.info(
"{} : Trying to identify EMBEDDED Microdata, OpenGraph or Schema.org -: {}".format(
"FsF-F2-01M", self.landing_url
)
)
# remove HTML comments, which sometimes cause extruct to fail
try:
extruct_target = re.sub("(<!--.*?-->)", "", extruct_target.decode("utf-8"))
except Exception:
pass

extracted = extruct.extract(extruct_target, syntaxes=syntaxes, encoding="utf-8")
except Exception as e:
extracted = {}
Expand Down

0 comments on commit f167b51

Please sign in to comment.