cleaning duplicates (#4)

hotgluexyz · Aug 1, 2024 · b36b558
1 parent 4e64c65
commit b36b558
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 0 deletions.
diff --git a/tap_pipedrive/stream.py b/tap_pipedrive/stream.py
@@ -7,6 +7,9 @@
 
 
 class PipedriveStream(object):
+    def __init__(self):
+        self.ids = []
+
     tap = None
     endpoint = ''
     key_properties = []

diff --git a/tap_pipedrive/tap.py b/tap_pipedrive/tap.py
@@ -278,7 +278,15 @@ def do_paginate(self, stream, stream_metadata):
             # records with metrics
             with singer.metrics.record_counter(stream.schema) as counter:
                 with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
+                    stream_name = stream.get_name()
                     for row in self.iterate_response(response):
+                        # Skip any row whose id is already recorded in stream.ids, so each id is processed only once (HGI-6285)
+                        if row["id"] not in stream.ids:
+                            stream.ids.append(row["id"])
+                        else:
+                            logger.info(f"id '{row['id']}' was previously fetched and processed for {stream_name}, skipping duplicate value...")
+                            continue
+
                         row = stream.process_row(row)
                         if not row: # in case of a non-empty response with an empty element
                             continue