Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add find function #138

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ testdata
testdata.zip
.DS_Store

coverage.out
coverage.out
out.txt
14 changes: 9 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
BUCKET_NAME=toukibo-parser-samples
URL=https://pub-a26a7972d1ea437b983bf6696a7d847e.r2.dev
DATA_DIR=testdata
NUM_SAMPLE=778
export NUM_SAMPLE=1148

build:
mkdir -p bin
go build -o bin/toukibo-parser main.go

run: build
./bin/toukibo-parser -path=$(TARGET).pdf
./bin/toukibo-parser -mode=run -path=$(TARGET).pdf

run/sample: build
./bin/toukibo-parser -path="$(DATA_DIR)/pdf/$(TARGET).pdf"

./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf"

find/sample: build
./bin/toukibo-parser -mode=find -path="$(DATA_DIR)/pdf/$(TARGET).pdf" -target="$(FIND)"
find/all: build
./find-samples.sh

edit:
cat $(DATA_DIR)/yaml/$(TARGET).yaml
Expand All @@ -22,7 +26,7 @@ check:
make edit TARGET=$(TARGET)

annotate: build
./bin/toukibo-parser -path="$(DATA_DIR)/pdf/$(TARGET).pdf" > $(DATA_DIR)/yaml/$(TARGET).yaml
./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf" > $(DATA_DIR)/yaml/$(TARGET).yaml
make check TARGET=$(TARGET)

annotate/all: build
Expand Down
17 changes: 5 additions & 12 deletions annotate-samples.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
#!/bin/bash
# Re-generate the YAML for every sample PDF and print how it differs from the
# previous YAML: the old file is moved aside as bak_sample<i>.yaml, the parser
# output is written to sample<i>.yaml, the two are diffed, and the backup is
# removed.
NUM_SAMPLE=1148

SKIP_SAMPLES=(0)

# Stop the whole script as soon as any command fails.
set -e

DATA_DIR=testdata

for ((i = 1; i <= NUM_SAMPLE; i++)); do
# Run the processing only when the sample number is not in the skip list.
if ! [[ " ${SKIP_SAMPLES[*]} " =~ " $i " ]]; then
echo "sample$i"
mv $DATA_DIR/yaml/sample$i.yaml $DATA_DIR/yaml/bak_sample$i.yaml
./bin/toukibo-parser -path="$DATA_DIR/pdf/sample$i.pdf" > $DATA_DIR/yaml/sample$i.yaml
# diff exits non-zero when the files differ; "|| true" keeps set -e from aborting.
diff ./$DATA_DIR/yaml/bak_sample$i.yaml ./$DATA_DIR/yaml/sample$i.yaml || true
rm $DATA_DIR/yaml/bak_sample$i.yaml
fi
echo "sample$i"
mv $DATA_DIR/yaml/sample$i.yaml $DATA_DIR/yaml/bak_sample$i.yaml
./bin/toukibo-parser -mode="run" -path="$DATA_DIR/pdf/sample$i.pdf" > $DATA_DIR/yaml/sample$i.yaml
# diff exits non-zero when the files differ; "|| true" keeps set -e from aborting.
diff ./$DATA_DIR/yaml/bak_sample$i.yaml ./$DATA_DIR/yaml/sample$i.yaml || true
rm $DATA_DIR/yaml/bak_sample$i.yaml
done
10 changes: 10 additions & 0 deletions find-samples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Search every sample PDF for a string by running toukibo-parser in find mode.
#
# Environment:
#   NUM_SAMPLE  number of samples to scan (sample1.pdf .. sample<NUM_SAMPLE>.pdf).
#               Required; the Makefile exports it, so `make find/all` provides it.
#   FIND        string to search for. Defaults to 優先株式 when unset or empty.

# Stop the whole script as soon as any command fails.
set -e

# An unset NUM_SAMPLE evaluates to 0 in the arithmetic loop below, which would
# make the script scan nothing and still exit successfully. Fail loudly instead.
: "${NUM_SAMPLE:?NUM_SAMPLE is not set (run via 'make find/all' or export it)}"

DATA_DIR=testdata
FIND="${FIND:-優先株式}"

for ((i = 1; i <= NUM_SAMPLE; i++)); do
    ./bin/toukibo-parser -mode="find" -path="$DATA_DIR/pdf/sample$i.pdf" -target="$FIND"
done
58 changes: 56 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,29 @@ import (
"github.com/tychy/toukibo-parser/toukibo"
)

// Command-line options; main binds each one to the flag of the same name.
var (
mode string // -mode: "run" or "find"
path string // -path: PDF file path
target string // -target: string handed to mainFind in find mode
)

// main registers the -mode, -path and -target flags, parses the command line,
// and dispatches on mode: "run" calls mainRun, "find" calls mainFind with the
// -target value, and any other value prints "invalid mode".
func main() {
f := flag.String("path", "testdata/pdf/sample1.pdf", "")
flag.StringVar(&mode, "mode", "run", "run or find")
flag.StringVar(&path, "path", "testdata/pdf/sample1.pdf", "pdf file path")
flag.StringVar(&target, "target", "", "")
flag.Parse()
path := fmt.Sprint(*f)

switch mode {
case "run":
mainRun()
case "find":
mainFind(target)
default:
fmt.Println("invalid mode")
}
}

func mainRun() {
content, err := readPdf(path)
if err != nil {
panic(err)
Expand Down Expand Up @@ -52,6 +71,41 @@ func main() {
return
}

// max returns the larger of a and b (b when they are equal).
func max(a, b int) int {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}

// min returns the smaller of a and b (b when they are equal).
func min(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}

// mainFind searches the text extracted from the PDF at the package-level path
// for the string s. When s occurs at least once it prints "found in <path>"
// and then, for every occurrence (overlapping ones included), a window of
// surrounding text: up to 60 bytes before the match start and up to 240 bytes
// from it, with both edges moved to UTF-8 character boundaries.
//
// An empty s prints "empty target" and returns without reading the PDF.
// It panics if readPdf returns an error.
func mainFind(s string) {
	// An empty search string matches at every offset, so scanning for it
	// would print one window per byte of the document; reject it up front.
	if s == "" {
		fmt.Println("empty target")
		return
	}

	content, err := readPdf(path)
	if err != nil {
		panic(err)
	}

	// isCont reports whether c is a UTF-8 continuation byte (10xxxxxx).
	isCont := func(c byte) bool { return c&0xC0 == 0x80 }

	announced := false
	// Search from a moving offset instead of truncating content, so the
	// leading context of later matches is still available.
	for from := 0; from < len(content); {
		rel := strings.Index(content[from:], s)
		if rel == -1 {
			break
		}
		idx := from + rel

		if !announced {
			fmt.Println("found in " + path)
			announced = true
		}

		// Show the text around the match. The window is computed in bytes,
		// so shrink the start forward and grow the end forward until neither
		// edge falls inside a multi-byte character.
		start := max(0, idx-60)
		end := min(len(content), idx+240)
		for start < idx && isCont(content[start]) {
			start++
		}
		for end < len(content) && isCont(content[end]) {
			end++
		}
		fmt.Println(content[start:end])

		// Advance by one byte so overlapping occurrences are still reported.
		from = idx + 1
	}
}

func readPdf(path string) (string, error) {
r, err := pdf.Open(path)
if err != nil {
Expand Down
Loading