Skip to content

Commit

Permalink
[REF] Compute_plausible_gaps, Efficiency, Stability
Browse files Browse the repository at this point in the history
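1. **Sorting without Reverse**: The textlines are now sorted directly in ascending order (the `reverse=True` argument is dropped), and each gap is computed as the current position minus the previous one. This produces the same non-negative gap values as the earlier descending sort, so the resulting percentiles are unchanged.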

2. **Array Creation for Gaps**: Instead of keeping the gaps as plain Python lists, we wrap them in `numpy` arrays as soon as they are computed. This lets the emptiness check use `.size` and lets the subsequent percentile calculation operate on arrays directly.

3. **Early Exits**: The checks for the lengths of `ref_h_textlines` and `ref_v_textlines` provide early exits if not enough textlines are available, preventing unnecessary calculations.

4. **Percentile Calculation**: The calculation itself is unchanged (each gap is still twice the 75th percentile of the measured gaps); the intermediate `percentile` variable is inlined as the literal `75`, and the inputs are now `numpy` arrays.
  • Loading branch information
bosd committed Oct 31, 2024
1 parent 313f75b commit ad1babd
Showing 1 changed file with 19 additions and 20 deletions.
39 changes: 19 additions & 20 deletions camelot/parsers/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,6 @@ def compute_plausible_gaps(self):
-------
gaps_hv : tuple
(horizontal_gap, vertical_gap) in pdf coordinate space.
"""
# Determine the textline that has the most combined
# alignments across horizontal and vertical axis.
Expand All @@ -459,6 +458,7 @@ def compute_plausible_gaps(self):
if best_alignment is None:
return None

# Extract the reference textlines
__, ref_h_textlines = best_alignment.max_h()
__, ref_v_textlines = best_alignment.max_v()

Expand All @@ -467,32 +467,31 @@ def compute_plausible_gaps(self):
return None

# Sort textlines based on their positions
h_textlines = sorted(
ref_h_textlines, key=lambda textline: textline.x0, reverse=True
)
v_textlines = sorted(
ref_v_textlines, key=lambda textline: textline.y0, reverse=True
)
h_textlines = sorted(ref_h_textlines, key=lambda textline: textline.x0)
v_textlines = sorted(ref_v_textlines, key=lambda textline: textline.y0)

# Calculate gaps between textlines
h_gaps = [
h_textlines[i - 1].x0 - h_textlines[i].x0
for i in range(1, len(h_textlines))
]
v_gaps = [
v_textlines[i - 1].y0 - v_textlines[i].y0
for i in range(1, len(v_textlines))
]
h_gaps = np.array(
[
h_textlines[i].x0 - h_textlines[i - 1].x0
for i in range(1, len(h_textlines))
]
)
v_gaps = np.array(
[
v_textlines[i].y0 - v_textlines[i - 1].y0
for i in range(1, len(v_textlines))
]
)

# If no gaps are found, return None
if not h_gaps or not v_gaps:
if h_gaps.size == 0 or v_gaps.size == 0:
return None

# Calculate the 75th percentile gaps
percentile = 75
# Calculate the 75th percentile gaps using numpy for efficiency
gaps_hv = (
2.0 * np.percentile(h_gaps, percentile),
2.0 * np.percentile(v_gaps, percentile),
2.0 * np.percentile(h_gaps, 75),
2.0 * np.percentile(v_gaps, 75),
)

return gaps_hv
Expand Down

0 comments on commit ad1babd

Please sign in to comment.