diff --git a/scripts/diff-sampling/README.md b/scripts/diff-sampling/README.md new file mode 100644 index 0000000000..f9fe3c4245 --- /dev/null +++ b/scripts/diff-sampling/README.md @@ -0,0 +1,36 @@ +# Diff sampling + +When we make changes to data, we take a sample of edits to check whether they in fact do what we intend. This is accomplished by taking a random sample of the diffs and posting for quality control before committing and pushing changes with many edits to the repo. + +There is sometimes utility in dumping the whole diff to a local file for in-depth exploration, manually or using `scripts/diff-sampling/diff-search.py` + + +# General procedure + +Use the [sample-git-diffs](https://pypi.org/project/sample-git-diffs/) tool to generate a sample of changes made to the corpus data. + +``` +sample-git-diffs --diffstat "git diff --stat -- corpus/protocols" --n 50 > +``` + +### diff-to-markdown + +Create a markdown file: + +``` +diff2markdown --path --username --repo --branch > +``` + +### git add the sample + +Git add _only_ the files sampled + +``` +python scripts/diff-sampling/git-add_diff-sample.py +``` + +* commit and push +* check links in markdown work +* open PR +* post sample.md in the comment +* use git stash to save uncommitted changes until the posted sample is deemed OK, then pop+add+commit \ No newline at end of file diff --git a/scripts/diff-sampling/diff-search.py b/scripts/diff-sampling/diff-search.py new file mode 100644 index 0000000000..d0aee5393e --- /dev/null +++ b/scripts/diff-sampling/diff-search.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Search for something in a git diff. Return N instances and total diff lines. 
search is simple substr in str type + +""" +import argparse, re + + + + +def ck_line(line, req, q): + if req: + #print(fr"{q}") + if re.search(fr"{q}", line): + return True + else: + if q in line: + return True + return False + + + + +def write_hits(hit_list, lines, outf): + to_write = [] + write_next = False + for i, l in enumerate(lines): + if l.startswith('diff '): + write_next = False + if l.startswith('index '): + if l in hit_list: + write_next = True + to_write.append(lines[i-1]) + to_write.append(l) + else: + if write_next: + to_write.append(l) + with open(outf, 'w+') as o: + for _ in to_write: + o.write(f"{_}"+"\n") + + + + + + + +def main(args): + changes = 0 + counter = 0 + hit_indexes = [] + with open(args.diff_file, 'r') as inf: + rlines = inf.readlines() + lines = [_.strip('\n') for _ in rlines] + index_id = None + for i, line in enumerate(lines, start=1): + if line.startswith('index '): + index_id = line + + if args.search_from: + if line.startswith('- '): + changes += 1 + if ck_line(line, args.regex, args.search_from): + if args.search_to: + if ck_line(lines[i], args.regex, args.search_to): + counter += 1 + hit_indexes.append(index_id) + if args.print_hit: + print(f"{i} | {line}") + print(f"{i+1} | {lines[i]}") + else: + counter += 1 + hit_indexes.append(index_id) + if args.print_hit: + print(f"{i} | {line}") + print(f"{i+1} | {lines[i]}") + else: + if line.startswith('+ '): + changes += 1 + if ck_line(line, args.regex, args.search_to): + counter += 1 + hit_indexes.append(index_id) + if args.print_hit: + print(f"{i-1} | {lines[i-2]}") + print(f"{i} | {line}") + + if args.out_file: + write_hits(hit_indexes, lines, args.out_file) + + print(changes, counter, counter/changes) + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-d","--diff-file", required=True, help="Path to .diff file.") + parser.add_argument("-s", "--search-from", type=str, default=None, + help="Substring to search for in 
the change from line (`-`). If given together with `-S | --search-to`, results are given when matches are found in both lines." + ) + parser.add_argument("-S", "--search-to", type=str, default=None, + help="Substring to searhc for in the change to line (`+`)." + ) + parser.add_argument("-b", "--search-both", type=str, default=None, + help="Seach the same substring and `+` and `-` lines. Equivalent of setting the `-s` and `-S` args with the same input." + ) + parser.add_argument("-r", "--regex", action="store_true", + help="Treat search strings as literal in regex queries." + ) + parser.add_argument('-p', '--print-hit', action='store_true', + help='Print match results to console.' + ) + parser.add_argument('-o', '--out-file', type=str, + help="Write matched diffs to the given output file." + ) + args = parser.parse_args() + if args.search_from or args.search_to or args.search_both: + if args.search_both: + args.search_from = args.search_both + args.search_to = args.search_both + main(args) + else: + print("You need to use `-s` or `-S` (or both). Try again.") + diff --git a/scripts/git-add_diff-sample.py b/scripts/diff-sampling/git-add_diff-sample.py similarity index 100% rename from scripts/git-add_diff-sample.py rename to scripts/diff-sampling/git-add_diff-sample.py diff --git a/scripts/git-add_QC-sample.sh b/scripts/git-add_QC-sample.sh deleted file mode 100755 index d7fa3612a6..0000000000 --- a/scripts/git-add_QC-sample.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -read -r -d '' DOCSTR << EOM -_____________________________________________________ - < After creating a qualtiy control sample, > - < this script stages the selected files in git... > - < for optimal laziness. > - < > - < But you need to provide a single argument, that > - < is, the first year of the decade sample. > - < > - < Run from project root as with the python scripts. 
> - ------------------------------------------------------ - \ ___-------___ - \ _-~~ ~~-_ - \ _-~ /~-_ - /^\__/^\ /~ \ / \\ - /| o|| o| / \_______________/ \\ - | |___||__| / / \ \\ - | \ / / \ \\ - | (_______) /______/ \_________ \\ - | / / \ / \\ - \ \^\\ \ / \ / - \ || \______________/ _-_ //\__// - \ ||------_-~~-_ ------------- \ --/~ ~\ || __/ - ~-----||====/~ |==================| |/~~~~~ - (_(__/ ./ / \_\ \. - (_(___/ \_____)_) - -EOM - -if [ -z ${1+x} ]; then - echo "$DOCSTR" -else - -sample_file=input/quality-control/sample_$1.txt - -if [ -f $sample_file ]; then - - echo "Input $sample_file exists." - while read -r line - do - echo "git adding $line" - git add "$line" - done < "$sample_file" - -else - - read -r -d '' ERRSTR << EOM - ___________________________________________________________________ - | You turkey! | - | No input file at $sample_file exists. | - | Find your input file and try again! | - ------------------------------------------------------------------ - \ ,+*^^*+___+++_X - \ ,*^^^^ ) - \ _+* ^**+_ - \ +^ _ _++*+_+++_, ) - _+^^*+_ ( ,+*^ ^ \+_ ) - { ) ( ,( ,_+--+--, ^) ^\\ - { (@) } f ,( ,+-^ __*_*_ ^^\_ ^\ ) - {:;-/ (_+*-+^^^^^+*+*<_ _++_)_ ) ) / - ( / ( ( ,___ ^*+_+* ) < < \\ - U _/ ) *--< ) ^\-----++__) ) ) ) - ( ) _(^)^^)) ) )\^^^^^))^*+/ / / - ( / (_))_^)) ) ) ))^^^^^))^^^)__/ +^^ - ( ,/ (^))^)) ) ) ))^^^^^^^))^^) _) - *+__+* (_))^) ) ) ))^^^^^^))^^^^^)____*^ - \ \_)^)_)) ))^^^^^^^^^^))^^^^) - (_ ^\__^^^^^^^^^^^^))^^^^^^^) - ^\___ ^\__^^^^^^))^^^^^^^^)\\ - ^^^^^\uuu/^^\uuu/^^^^\^\^\^\^\^\^\^\\ - ___) >____) >___ ^\_\_\_\_\_\_\) - ^^^//\\_^^//\\_^ ^(\_\_\_\) - ^^^ ^^ ^^^ ^ -EOM - echo "$ERRSTR" - -fi -fi