[REF] Compute_plausible_gaps, Efficiency, Stability

1. **Sorting without Reverse**: The textlines are now sorted in ascending order (the `reverse=True` argument is dropped). Each gap is computed as the current position minus the previous one instead of the previous minus the current, so the resulting set of gap values is unchanged.
2. **Array Creation for Gaps**: The gap list comprehensions are now wrapped in `np.array(...)`, so the gaps are held as `numpy` arrays for the subsequent calculations. Because of this, the "no gaps" check changes from `not h_gaps or not v_gaps` to `h_gaps.size == 0 or v_gaps.size == 0`.
3. **Early Exits**: The existing checks on `best_alignment` and on the lengths of `ref_h_textlines` and `ref_v_textlines` are kept; they return `None` early when not enough textlines are available, preventing unnecessary calculations.
4. **Percentile Calculation**: The result is still twice the 75th percentile of the horizontal and vertical gaps. The only change is that the literal `75` is passed to `np.percentile` directly instead of through a local `percentile` variable.
py-pdf · Oct 31, 2024 · ad1babd · ad1babd
1 parent 313f75b
commit ad1babd
Showing 1 changed file with 19 additions and 20 deletions.
diff --git a/camelot/parsers/network.py b/camelot/parsers/network.py
@@ -446,7 +446,6 @@ def compute_plausible_gaps(self):
         -------
         gaps_hv : tuple
             (horizontal_gap, vertical_gap) in pdf coordinate space.
-
         """
         # Determine the textline that has the most combined
         # alignments across horizontal and vertical axis.
@@ -459,6 +458,7 @@ def compute_plausible_gaps(self):
         if best_alignment is None:
             return None
 
+        # Extract the reference textlines
         __, ref_h_textlines = best_alignment.max_h()
         __, ref_v_textlines = best_alignment.max_v()
 
@@ -467,32 +467,31 @@ def compute_plausible_gaps(self):
             return None
 
         # Sort textlines based on their positions
-        h_textlines = sorted(
-            ref_h_textlines, key=lambda textline: textline.x0, reverse=True
-        )
-        v_textlines = sorted(
-            ref_v_textlines, key=lambda textline: textline.y0, reverse=True
-        )
+        h_textlines = sorted(ref_h_textlines, key=lambda textline: textline.x0)
+        v_textlines = sorted(ref_v_textlines, key=lambda textline: textline.y0)
 
         # Calculate gaps between textlines
-        h_gaps = [
-            h_textlines[i - 1].x0 - h_textlines[i].x0
-            for i in range(1, len(h_textlines))
-        ]
-        v_gaps = [
-            v_textlines[i - 1].y0 - v_textlines[i].y0
-            for i in range(1, len(v_textlines))
-        ]
+        h_gaps = np.array(
+            [
+                h_textlines[i].x0 - h_textlines[i - 1].x0
+                for i in range(1, len(h_textlines))
+            ]
+        )
+        v_gaps = np.array(
+            [
+                v_textlines[i].y0 - v_textlines[i - 1].y0
+                for i in range(1, len(v_textlines))
+            ]
+        )
 
         # If no gaps are found, return None
-        if not h_gaps or not v_gaps:
+        if h_gaps.size == 0 or v_gaps.size == 0:
             return None
 
-        # Calculate the 75th percentile gaps
-        percentile = 75
+        # Calculate the 75th percentile gaps using numpy for efficiency
         gaps_hv = (
-            2.0 * np.percentile(h_gaps, percentile),
-            2.0 * np.percentile(v_gaps, percentile),
+            2.0 * np.percentile(h_gaps, 75),
+            2.0 * np.percentile(v_gaps, 75),
         )
 
         return gaps_hv