diff --git a/.gitignore b/.gitignore
index 21a8111..d41b8f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@
 testdata
 testdata.zip
 .DS_Store
-coverage.out
\ No newline at end of file
+coverage.out
+out.txt
diff --git a/Makefile b/Makefile
index 5360c38..512ac17 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,22 @@
 BUCKET_NAME=toukibo-parser-samples
 URL=https://pub-a26a7972d1ea437b983bf6696a7d847e.r2.dev
 DATA_DIR=testdata
-NUM_SAMPLE=778
+export NUM_SAMPLE=1148
 
 build:
 	mkdir -p bin
 	go build -o bin/toukibo-parser main.go
 
 run: build
-	./bin/toukibo-parser -path=$(TARGET).pdf
+	./bin/toukibo-parser -mode=run -path=$(TARGET).pdf
 
 run/sample: build
-	./bin/toukibo-parser -path="$(DATA_DIR)/pdf/$(TARGET).pdf"
-
+	./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf"
+
+find/sample: build
+	./bin/toukibo-parser -mode=find -path="$(DATA_DIR)/pdf/$(TARGET).pdf" -target="$(FIND)"
+find/all: build
+	FIND="$(FIND)" ./find-samples.sh
 
 edit:
 	cat $(DATA_DIR)/yaml/$(TARGET).yaml
@@ -22,7 +26,7 @@ check:
 	make edit TARGET=$(TARGET)
 
 annotate: build
-	./bin/toukibo-parser -path="$(DATA_DIR)/pdf/$(TARGET).pdf" > $(DATA_DIR)/yaml/$(TARGET).yaml
+	./bin/toukibo-parser -mode=run -path="$(DATA_DIR)/pdf/$(TARGET).pdf" > $(DATA_DIR)/yaml/$(TARGET).yaml
 	make check TARGET=$(TARGET)
 
 annotate/all: build
diff --git a/annotate-samples.sh b/annotate-samples.sh
index 2206b54..a3cd1a9 100755
--- a/annotate-samples.sh
+++ b/annotate-samples.sh
@@ -1,20 +1,14 @@
 #!/bin/bash
-NUM_SAMPLE=1148
-
-SKIP_SAMPLES=(0)
-
-# スクリプト全体でエラーが発生したら停止する
+# Stop the script as soon as any command fails.
 set -e
 
 DATA_DIR=testdata
+: "${NUM_SAMPLE:?NUM_SAMPLE must be set (exported by the Makefile)}"
 
 for ((i = 1; i <= NUM_SAMPLE; i++)); do
-    # サンプル番号がスキップリストに含まれていない場合に処理を実行
-    if ! [[ " ${SKIP_SAMPLES[*]} " =~ " $i " ]]; then
-        echo "sample$i"
-        mv $DATA_DIR/yaml/sample$i.yaml $DATA_DIR/yaml/bak_sample$i.yaml
-        ./bin/toukibo-parser -path="$DATA_DIR/pdf/sample$i.pdf" > $DATA_DIR/yaml/sample$i.yaml
-        diff ./$DATA_DIR/yaml/bak_sample$i.yaml ./$DATA_DIR/yaml/sample$i.yaml || true
-        rm $DATA_DIR/yaml/bak_sample$i.yaml
-    fi
+    echo "sample$i"
+    mv $DATA_DIR/yaml/sample$i.yaml $DATA_DIR/yaml/bak_sample$i.yaml
+    ./bin/toukibo-parser -mode="run" -path="$DATA_DIR/pdf/sample$i.pdf" > $DATA_DIR/yaml/sample$i.yaml
+    diff ./$DATA_DIR/yaml/bak_sample$i.yaml ./$DATA_DIR/yaml/sample$i.yaml || true
+    rm $DATA_DIR/yaml/bak_sample$i.yaml
 done
diff --git a/find-samples.sh b/find-samples.sh
new file mode 100755
index 0000000..c96f3ad
--- /dev/null
+++ b/find-samples.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Stop the script as soon as any command fails.
+set -e
+
+DATA_DIR=testdata
+# Search term; override it through the FIND environment variable.
+FIND=${FIND:-優先株式}
+: "${NUM_SAMPLE:?NUM_SAMPLE must be set (exported by the Makefile)}"
+
+for ((i = 1; i <= NUM_SAMPLE; i++)); do
+    ./bin/toukibo-parser -mode="find" -path="$DATA_DIR/pdf/sample$i.pdf" -target="$FIND"
+done
diff --git a/main.go b/main.go
index fee4aae..5867e24 100644
--- a/main.go
+++ b/main.go
@@ -10,10 +10,32 @@ import (
 	"github.com/tychy/toukibo-parser/toukibo"
 )
 
+// Command-line flags, populated in main.
+var (
+	mode   string
+	path   string
+	target string
+)
+
 func main() {
-	f := flag.String("path", "testdata/pdf/sample1.pdf", "")
+	flag.StringVar(&mode, "mode", "run", "run or find")
+	flag.StringVar(&path, "path", "testdata/pdf/sample1.pdf", "pdf file path")
+	flag.StringVar(&target, "target", "", "")
 	flag.Parse()
-	path := fmt.Sprint(*f)
+
+	switch mode {
+	case "run":
+		mainRun()
+	case "find":
+		mainFind(target)
+	default:
+		fmt.Println("invalid mode")
+	}
+}
+
+// mainRun holds the previous body of main: it reads the PDF at path and
+// handles the extracted content.
+func mainRun() {
 	content, err := readPdf(path)
 	if err != nil {
 		panic(err)
@@ -52,6 +74,57 @@ func main() {
 	return
 }
 
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// mainFind prints every occurrence of s in the text extracted from the PDF at
+// path, each with up to 60 bytes of leading and 240 bytes of trailing context.
+func mainFind(s string) {
+	// An empty target matches at every offset, so reject it up front.
+	if s == "" {
+		fmt.Println("target must not be empty")
+		return
+	}
+
+	content, err := readPdf(path)
+	if err != nil {
+		panic(err)
+	}
+
+	found := false
+	pos := 0
+	for {
+		rel := strings.Index(content[pos:], s)
+		if rel == -1 {
+			break
+		}
+		if !found {
+			fmt.Println("found in " + path)
+			found = true
+		}
+		idx := pos + rel
+		window := content[max(0, idx-60):min(len(content), idx+240)]
+		// The window is cut at byte offsets; drop any rune split at its edges
+		// rather than printing broken UTF-8.
+		fmt.Println(strings.ToValidUTF8(window, ""))
+		// Advance a single byte so overlapping occurrences are still reported.
+		pos = idx + 1
+	}
+}
+
 func readPdf(path string) (string, error) {
 	r, err := pdf.Open(path)
 	if err != nil {