Skip to content

Commit

Permalink
Merge pull request #138 from tychy/find-word
Browse files Browse the repository at this point in the history
add find function
  • Loading branch information
tychy authored Sep 18, 2024
2 parents 74c1f37 + a5aefa0 commit 238834c
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 20 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ testdata
testdata.zip
.DS_Store

coverage.out
coverage.out
out.txt
14 changes: 9 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
# Project-wide settings.
BUCKET_NAME := toukibo-parser-samples
URL := https://pub-a26a7972d1ea437b983bf6696a7d847e.r2.dev
DATA_DIR := testdata
# Number of numbered samples (sample1 .. sampleN). Assigned exactly once and
# exported so that shell scripts started from recipes can read it from their
# environment (find-samples.sh loops up to NUM_SAMPLE without defining it).
export NUM_SAMPLE := 1148

# Compile main.go into bin/toukibo-parser.
# Declared phony: "build" names a task, not a file this rule produces.
.PHONY: build
build:
	mkdir -p bin
	go build -o bin/toukibo-parser main.go

# Run the parser in "run" mode on $(TARGET).pdf.
# Usage: make run TARGET=path/to/file   (without the .pdf extension)
.PHONY: run
run: build
	./bin/toukibo-parser -mode=run -path="$(TARGET).pdf"

# Run the parser in "run" mode on one sample PDF under $(DATA_DIR)/pdf.
# Usage: make run/sample TARGET=sample1
.PHONY: run/sample
run/sample: build
	./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf"

# Search one sample PDF for the word given in FIND (parser "find" mode).
# Usage: make find/sample TARGET=sample1 FIND=word
.PHONY: find/sample
find/sample: build
	./bin/toukibo-parser -mode=find -path="$(DATA_DIR)/pdf/$(TARGET).pdf" -target="$(FIND)"
# Run find-samples.sh, which calls the parser in "find" mode once per sample.
# The script reads NUM_SAMPLE from the environment.
.PHONY: find/all
find/all: build
	./find-samples.sh

edit:
cat $(DATA_DIR)/yaml/$(TARGET).yaml
Expand All @@ -22,7 +26,7 @@ check:
make edit TARGET=$(TARGET)

# Regenerate the YAML annotation of sample $(TARGET) from its PDF, then run
# the check target on it.
# Usage: make annotate TARGET=sample1
.PHONY: annotate
annotate: build
	./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf" > "$(DATA_DIR)/yaml/$(TARGET).yaml"
	# $(MAKE) rather than a literal "make" so that flags such as -n and -j
	# are passed on to the sub-make.
	$(MAKE) check TARGET=$(TARGET)

annotate/all: build
Expand Down
17 changes: 5 additions & 12 deletions annotate-samples.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
#!/bin/bash
# Regenerate the YAML annotation of every sample and show what changed.
#
# For each sample i in 1..NUM_SAMPLE the current yaml is moved aside, the
# parser output is written in its place, the two versions are diffed and the
# backup is removed.
#
# Environment:
#   NUM_SAMPLE  number of samples to process (default: 1148)

# Stop the whole script as soon as any command fails.
set -e

DATA_DIR=testdata
NUM_SAMPLE="${NUM_SAMPLE:-1148}"

for ((i = 1; i <= NUM_SAMPLE; i++)); do
    yaml="$DATA_DIR/yaml/sample$i.yaml"
    backup="$DATA_DIR/yaml/bak_sample$i.yaml"

    echo "sample$i"
    mv "$yaml" "$backup"

    # If the parser fails, put the previous annotation back before stopping,
    # so a failed run never leaves a truncated yaml and a stray backup behind.
    if ! ./bin/toukibo-parser -mode="run" -path="$DATA_DIR/pdf/sample$i.pdf" > "$yaml"; then
        mv "$backup" "$yaml"
        exit 1
    fi

    # diff exits non-zero when the files differ; that is expected here and
    # must not abort the run under "set -e".
    diff "./$backup" "./$yaml" || true
    rm "$backup"
done
10 changes: 10 additions & 0 deletions find-samples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Search every sample PDF for a word using the parser's "find" mode.
#
# Environment:
#   NUM_SAMPLE  number of samples to scan (required; exported by the Makefile)
#   FIND        word to search for (default: 優先株式)

# Stop the whole script as soon as any command fails.
set -e

DATA_DIR=testdata
FIND="${FIND:-優先株式}"

# An unset NUM_SAMPLE evaluates to 0 in the arithmetic loop condition, which
# would make the script scan nothing and still exit successfully. Fail loudly.
: "${NUM_SAMPLE:?NUM_SAMPLE must be set (run via 'make find/all' or export it)}"

for ((i = 1; i <= NUM_SAMPLE; i++)); do
    ./bin/toukibo-parser -mode="find" -path="$DATA_DIR/pdf/sample$i.pdf" -target="$FIND"
done
58 changes: 56 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,29 @@ import (
"github.com/tychy/toukibo-parser/toukibo"
)

// Command-line options. main binds each of them to a flag and fills them in
// with flag.Parse.
var (
// mode is the value of -mode (default "run"); main dispatches on "run" or "find".
mode string
// path is the value of -path (default "testdata/pdf/sample1.pdf"); mainRun and
// mainFind pass it to readPdf.
path string
// target is the value of -target (default ""); main passes it to mainFind.
target string
)

// main parses the command-line flags into the package-level mode, path and
// target variables and dispatches on -mode:
//
//	run  -> mainRun()
//	find -> mainFind(target)
//
// Any other mode prints "invalid mode".
func main() {
	// Every flag is registered exactly once: the flag package panics when the
	// same flag name is defined a second time.
	flag.StringVar(&mode, "mode", "run", "run or find")
	flag.StringVar(&path, "path", "testdata/pdf/sample1.pdf", "pdf file path")
	flag.StringVar(&target, "target", "", "string to search for (find mode)")
	flag.Parse()

	switch mode {
	case "run":
		mainRun()
	case "find":
		mainFind(target)
	default:
		fmt.Println("invalid mode")
	}
}

func mainRun() {
content, err := readPdf(path)
if err != nil {
panic(err)
Expand Down Expand Up @@ -52,6 +71,41 @@ func main() {
return
}

// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}

// mainFind searches the text that readPdf extracts from the package-level
// path for every occurrence of s and prints each hit with its surroundings.
//
// When s occurs at least once, "found in <path>" is printed once, followed by
// one excerpt per occurrence (overlapping occurrences included). An excerpt
// covers about 60 bytes before and 240 bytes after the start of the match,
// clamped to the text and widened or narrowed by a few bytes so that it never
// starts or ends inside a multi-byte UTF-8 character. Nothing is printed when
// s does not occur.
//
// An empty s is rejected with a message instead of being searched for: it
// would match at every offset. mainFind panics when the PDF cannot be read.
func mainFind(s string) {
	if s == "" {
		fmt.Println("target must not be empty")
		return
	}

	content, err := readPdf(path)
	if err != nil {
		panic(err)
	}

	idx := strings.Index(content, s)
	if idx == -1 {
		return
	}
	fmt.Println("found in " + path)

	// Walk the matches by offset into the full text rather than re-slicing
	// content, so the context in front of later matches is still available.
	for {
		lo := max(0, idx-60)
		hi := min(len(content), idx+240)
		// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step past
		// them so the excerpt begins and ends on a character boundary.
		for lo < idx && content[lo]&0xC0 == 0x80 {
			lo++
		}
		for hi < len(content) && content[hi]&0xC0 == 0x80 {
			hi++
		}
		// Show the text around the match.
		fmt.Println(content[lo:hi])

		// Resume one byte after the start of this match.
		next := strings.Index(content[idx+1:], s)
		if next == -1 {
			return
		}
		idx += next + 1
	}
}

func readPdf(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
Expand Down

0 comments on commit 238834c

Please sign in to comment.