From f167b513d095016f770957c0076c316f2367a5b7 Mon Sep 17 00:00:00 2001 From: huberrob Date: Mon, 16 Oct 2023 14:33:45 +0200 Subject: [PATCH] small workaround to deal with html comments in JSON-LD --- fuji_server/harvester/metadata_harvester.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py index 0bd10450..6b1960c6 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -615,12 +615,19 @@ def retrieve_metadata_embedded_extruct(self): except Exception: extruct_target = self.landing_html pass + try: self.logger.info( "{} : Trying to identify EMBEDDED Microdata, OpenGraph or Schema.org -: {}".format( "FsF-F2-01M", self.landing_url ) ) + # remove html comments which sometimes fails in extruct... + try: + extruct_target = re.sub("(<!--.*?-->)", "", extruct_target.decode("utf-8")) + except Exception: + pass + extracted = extruct.extract(extruct_target, syntaxes=syntaxes, encoding="utf-8") except Exception as e: extracted = {}