From ad1babdbbad7145bf6d543577504445b5a44483e Mon Sep 17 00:00:00 2001 From: bosd Date: Thu, 31 Oct 2024 21:39:40 +0100 Subject: [PATCH] [REF] Compute_plausible_gaps, Efficiency, Stability 1. **Sorting without Reverse**: The textlines are now sorted in ascending order (the `reverse=True` argument to `sorted` is dropped), and each gap is computed as the current position minus the previous one, so the resulting gap values are the same as before. 2. **Array Creation for Gaps**: The gap lists are wrapped in `numpy` arrays as soon as they are built, and the emptiness check is changed from `not h_gaps` to `h_gaps.size == 0` to match. This allows us to utilize `numpy`'s efficient operations for subsequent calculations. 3. **Early Exits**: The existing checks for the lengths of `ref_h_textlines` and `ref_v_textlines` are kept; they provide early exits if not enough textlines are available, preventing unnecessary calculations. 4. **Percentile Calculation**: The 75th-percentile calculation remains unchanged (the `percentile` local is simply inlined as the literal `75`), and it now operates on `numpy` arrays. --- camelot/parsers/network.py | 39 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/camelot/parsers/network.py b/camelot/parsers/network.py index f538ce69..c78a0138 100644 --- a/camelot/parsers/network.py +++ b/camelot/parsers/network.py @@ -446,7 +446,6 @@ def compute_plausible_gaps(self): ------- gaps_hv : tuple (horizontal_gap, vertical_gap) in pdf coordinate space. - """ # Determine the textline that has the most combined # alignments across horizontal and vertical axis. 
@@ -459,6 +458,7 @@ def compute_plausible_gaps(self): if best_alignment is None: return None + # Extract the reference textlines __, ref_h_textlines = best_alignment.max_h() __, ref_v_textlines = best_alignment.max_v() @@ -467,32 +467,31 @@ def compute_plausible_gaps(self): return None # Sort textlines based on their positions - h_textlines = sorted( - ref_h_textlines, key=lambda textline: textline.x0, reverse=True - ) - v_textlines = sorted( - ref_v_textlines, key=lambda textline: textline.y0, reverse=True - ) + h_textlines = sorted(ref_h_textlines, key=lambda textline: textline.x0) + v_textlines = sorted(ref_v_textlines, key=lambda textline: textline.y0) # Calculate gaps between textlines - h_gaps = [ - h_textlines[i - 1].x0 - h_textlines[i].x0 - for i in range(1, len(h_textlines)) - ] - v_gaps = [ - v_textlines[i - 1].y0 - v_textlines[i].y0 - for i in range(1, len(v_textlines)) - ] + h_gaps = np.array( + [ + h_textlines[i].x0 - h_textlines[i - 1].x0 + for i in range(1, len(h_textlines)) + ] + ) + v_gaps = np.array( + [ + v_textlines[i].y0 - v_textlines[i - 1].y0 + for i in range(1, len(v_textlines)) + ] + ) # If no gaps are found, return None - if not h_gaps or not v_gaps: + if h_gaps.size == 0 or v_gaps.size == 0: return None - # Calculate the 75th percentile gaps - percentile = 75 + # Calculate the 75th percentile gaps using numpy for efficiency gaps_hv = ( - 2.0 * np.percentile(h_gaps, percentile), - 2.0 * np.percentile(v_gaps, percentile), + 2.0 * np.percentile(h_gaps, 75), + 2.0 * np.percentile(v_gaps, 75), ) return gaps_hv