Add another PK chunking attempt with a smaller chunk size
butkeraites-hotglue committed Oct 7, 2024
1 parent ab6a26b commit 6e25070
Showing 2 changed files with 60 additions and 50 deletions.
105 changes: 59 additions & 46 deletions tap_salesforce/salesforce/bulk.py
@@ -18,7 +18,8 @@
BATCH_STATUS_POLLING_SLEEP = 20
PK_CHUNKED_BATCH_STATUS_POLLING_SLEEP = 60
ITER_CHUNK_SIZE = 1024
DEFAULT_CHUNK_SIZE = 100000 # Max is 250000
DEFAULT_CHUNK_SIZE = 250000 # Max is 250000
FALLBACK_CHUNK_SIZE = 2000

LOGGER = singer.get_logger()
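
For context on the two constants: Salesforce Bulk API PK chunking is requested through the Sforce-Enable-PKChunking header at job creation, and chunkSize caps how many records each auto-created batch covers. A minimal sketch of how the constants above select that header value (the helper name is illustrative, not part of the tap):

DEFAULT_CHUNK_SIZE = 250000   # the maximum chunkSize ("Max is 250000" per the comment above)
FALLBACK_CHUNK_SIZE = 2000    # retry value for objects that fail at the maximum

def pk_chunking_header(use_fall_back_chunk_size=False):
    """Return the PK-chunking header for a Bulk API job (illustrative helper)."""
    chunk_size = FALLBACK_CHUNK_SIZE if use_fall_back_chunk_size else DEFAULT_CHUNK_SIZE
    return {"Sforce-Enable-PKChunking": "true; chunkSize={}".format(chunk_size)}

# pk_chunking_header()      -> {'Sforce-Enable-PKChunking': 'true; chunkSize=250000'}
# pk_chunking_header(True)  -> {'Sforce-Enable-PKChunking': 'true; chunkSize=2000'}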

@@ -113,49 +114,60 @@ def _can_pk_chunk_job(self, failure_message): # pylint: disable=no-self-use
"Retried more than 15 times" in failure_message or \
"Failed to write query result" in failure_message

    def try_bulking_with_pk_chunking(self, catalog_entry, state, use_fall_back_chunk_size=False):
        start_date = self.sf.get_start_date(state, catalog_entry)
        batch_status = self._bulk_query_with_pk_chunking(catalog_entry, start_date, use_fall_back_chunk_size)
        job_id = batch_status['job_id']
        self.sf.pk_chunking = True
        # Write job ID and batch state for resumption
        tap_stream_id = catalog_entry['tap_stream_id']
        self.tap_stream_id = tap_stream_id
        state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id)
        state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batch_status['completed'][:])

        # Parallelize the batch result processing
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.process_batch, job_id, completed_batch_id, catalog_entry, state)
                for completed_batch_id in batch_status['completed']
            ]

            # Process the results as they complete
            for future in futures:
                for result in future.result():
                    yield result
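
The JobID/BatchIDs bookmarks written above are what lets an interrupted sync resume. Roughly, the singer state ends up shaped like this (stream name and Salesforce IDs are made up, for illustration only):

state = {
    "bookmarks": {
        "Account": {
            "JobID": "750xx0000000001AAA",
            "BatchIDs": ["751xx0000000001AAA", "751xx0000000002AAA"],
        }
    }
}
# process_batch() logs "Batches to go" over this list and re-writes the state
# as batches finish, so a resumed run can skip work that already completed.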

    def _bulk_query(self, catalog_entry, state):
        try:
            start_date = self.sf.get_start_date(state, catalog_entry)
            batch_status = self._bulk_query_with_pk_chunking(catalog_entry, start_date)
            job_id = batch_status['job_id']
            self.sf.pk_chunking = True
            # Write job ID and batch state for resumption
            tap_stream_id = catalog_entry['tap_stream_id']
            state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id)
            state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batch_status['completed'][:])

            # Parallelize the batch result processing
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(self.process_batch, job_id, completed_batch_id, catalog_entry, state)
                    for completed_batch_id in batch_status['completed']
                ]

                # Process the results as they complete
                for future in futures:
                    for result in future.result():
                        yield result
            yield from self.try_bulking_with_pk_chunking(catalog_entry, state)
        except Exception as e:
            if job_id in self.closed_jobs:
                LOGGER.info(f"Another batch failed before. Ignoring this new job...")
                pass
            LOGGER.info(f"PK Chunking failed on job {job_id}. Trying without it...")
            self._close_job(job_id)

            job_id = self._create_job(catalog_entry)
            start_date = self.sf.get_start_date(state, catalog_entry)
            self.sf.pk_chunking = False

            batch_id = self._add_batch(catalog_entry, job_id, start_date)

            self._close_job(job_id)

            batch_status = self._poll_on_batch_status(job_id, batch_id)
            if batch_status['state'] == 'Failed':
                raise TapSalesforceException(batch_status['stateMessage'])
            else:
                for result in self.get_batch_results(job_id, batch_id, catalog_entry):
                    yield result
            try:
                yield from self.try_bulking_with_pk_chunking(catalog_entry, state, True)
            except Exception as e:
                if job_id in self.closed_jobs:
                    LOGGER.info(f"Another batch failed before. Ignoring this new job...")
                    pass
                LOGGER.info(f"PK Chunking failed on job {job_id}. Trying without it...")
                self._close_job(job_id)

                if hasattr(self, "tap_stream_id"):
                    with open("streams_pk_chunking_failing.txt", "a") as file:
                        file.write(self.tap_stream_id + "\n")  # Append data with a newline character

                job_id = self._create_job(catalog_entry)
                start_date = self.sf.get_start_date(state, catalog_entry)
                self.sf.pk_chunking = False

                batch_id = self._add_batch(catalog_entry, job_id, start_date)

                self._close_job(job_id)

                batch_status = self._poll_on_batch_status(job_id, batch_id)
                if batch_status['state'] == 'Failed':
                    raise TapSalesforceException(batch_status['stateMessage'])
                else:
                    for result in self.get_batch_results(job_id, batch_id, catalog_entry):
                        yield result

    def process_batch(self, job_id, batch_id, catalog_entry, state):
        """Process a single batch and yield results."""
@@ -168,11 +180,11 @@ def process_batch(self, job_id, batch_id, catalog_entry, state):
LOGGER.info("Batches to go: %d", len(state['bookmarks'][catalog_entry['tap_stream_id']]["BatchIDs"]))
singer.write_state(state)

    def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
    def _bulk_query_with_pk_chunking(self, catalog_entry, start_date, use_fall_back_chunk_size=False):
        LOGGER.info("Trying Bulk Query with PK Chunking")

        # Create a new job
        job_id = self._create_job(catalog_entry, True)
        job_id = self._create_job(catalog_entry, True, use_fall_back_chunk_size)

        self._add_batch(catalog_entry, job_id, start_date, False)

Expand All @@ -187,7 +199,7 @@ def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):

        return batch_status

    def _create_job(self, catalog_entry, pk_chunking=False):
    def _create_job(self, catalog_entry, pk_chunking=False, use_fall_back_chunk_size=False):
        url = self.bulk_url.format(self.sf.instance_url, "job")
        body = {"operation": "queryAll", "object": catalog_entry['stream'], "contentType": "CSV"}

Expand All @@ -196,8 +208,9 @@ def _create_job(self, catalog_entry, pk_chunking=False):

        if pk_chunking:
            LOGGER.info("ADDING PK CHUNKING HEADER")

            headers['Sforce-Enable-PKChunking'] = "true; chunkSize={}".format(DEFAULT_CHUNK_SIZE)
            chunk_size = DEFAULT_CHUNK_SIZE if not use_fall_back_chunk_size else FALLBACK_CHUNK_SIZE
            headers['Sforce-Enable-PKChunking'] = "true; chunkSize={}".format(chunk_size)
            LOGGER.info(f"[use_fall_back_chunk_size:{use_fall_back_chunk_size}] HEADERS: {headers}")

            # If the stream ends with 'CleanInfo' or 'History', we can PK Chunk on the object's parent
            if any(catalog_entry['stream'].endswith(suffix) for suffix in ["CleanInfo", "History"]):
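
The parent-object branch referenced by that comment is cut off in this diff, but for orientation: the same header also accepts a parent object (e.g. "true; chunkSize=2000; parent=Account" for AccountHistory), which is how PK chunking works for History and CleanInfo streams. A hypothetical sketch of deriving that parent name, not the tap's actual code:

def parent_object(stream_name):
    """Hypothetical illustration: strip the History/CleanInfo suffix to get the parent object."""
    for suffix in ("CleanInfo", "History"):
        if stream_name.endswith(suffix):
            return stream_name[:-len(suffix)]
    return None

# parent_object("AccountHistory") -> "Account", giving a header like
# "true; chunkSize=2000; parent=Account"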
5 changes: 1 addition & 4 deletions tap_salesforce/sync.py
@@ -204,10 +204,7 @@ def handle_ListView(sf,rec_id,sobject,lv_name,lv_catalog_entry,state,input_state
time_extracted=start_time))

def sync_records(sf, catalog_entry, state, input_state, counter, catalog,config=None):
    download_files = False
    if "download_files" in config:
        if config['download_files']==True:
            download_files = True
    download_files = config.get("download_files", False)
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
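
One behavioural nuance of the simplified flag handling: the old code only enabled downloads when the config value was literally True, whereas config.get() passes any truthy value through. A small illustration (the config dict here is made up):

config = {"download_files": "false"}   # e.g. a string value that was never parsed to a bool
old_style = "download_files" in config and config["download_files"] == True   # False
new_style = config.get("download_files", False)                               # "false" (truthy)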
