This repository has been archived by the owner on May 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
organized diff-sampling stuff into subdir
- Loading branch information
Showing
4 changed files
with
160 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Diff sampling | ||
|
||
When we make changes to data, we take a sample of edits to check whether they in fact do what we intend. This is accomplished by taking a random sample of the diffs and posting it for quality control before committing and pushing changes with many edits to the repo. | ||
|
||
There is sometimes utility in dumping the whole diff to a local file for in-depth exploration, either manually or using `scripts/diff-sampling/diff-search.py`. | ||
|
||
|
||
# General procedure | ||
|
||
Use the [sample-git-diffs](https://pypi.org/project/sample-git-diffs/) tool to generate a sample of changes made to the corpus data. | ||
|
||
``` | ||
sample-git-diffs --diffstat "git diff --stat -- corpus/protocols" --n 50 > <path/to/.diff-file> | ||
``` | ||
|
||
### diff-to-markdown | ||
|
||
Create a markdown file: | ||
|
||
``` | ||
diff2markdown --path <path/to/.diff-file> --username <of/repo> --repo <repo> --branch <branch> > <path/to/.md-file> | ||
``` | ||
|
||
### git add the sample | ||
|
||
Git add _only_ the files sampled | ||
|
||
``` | ||
python scripts/diff-sampling/git-add_diff-sample.py | ||
``` | ||
|
||
* commit and push | ||
* check links in markdown work | ||
* open PR | ||
* post sample.md in the comment | ||
* use git stash to save uncommitted changes until the posted sample is deemed OK, then pop+add+commit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
Search for something in a git diff. Return N instances and total diff lines. search is simple substr in str type | ||
""" | ||
import argparse, re | ||
|
||
|
||
|
||
|
||
def ck_line(line, req, q):
    """
    Tell whether the query `q` is found in `line`.

    Args:
        line: the text to search in.
        req: when truthy, `q` is used as a regular expression pattern and
            tested with `re.search`; otherwise a plain substring test is done.
        q: the search string (or pattern).

    Returns:
        True when the query is found, False otherwise.
    """
    # Plain substring mode: `in` already yields the boolean we need.
    if not req:
        return q in line
    # Regex mode: `re.search` gives a match object or None.
    return re.search(fr"{q}", line) is not None
|
||
|
||
|
||
|
||
def write_hits(hit_list, lines, outf):
    """
    Write the per-file diffs whose `index ` line is listed in `hit_list` to `outf`.

    For every line of `lines` that starts with `index ` and is contained in
    `hit_list`, the line directly before it (normally the `diff ` header), the
    index line itself, and all following lines up to the next line starting
    with `diff ` are written out, one per line.

    Args:
        hit_list: iterable of `index ` lines identifying the diffs to keep.
        lines: all lines of the diff, without trailing newlines.
        outf: path of the file to (over)write.
    """
    # A set gives constant-time membership tests and collapses the duplicate
    # entries produced when one file contains several hits.
    wanted = set(hit_list)
    to_write = []
    write_next = False
    for i, l in enumerate(lines):
        if l.startswith('diff '):
            # A new file-level diff begins: stop copying the previous one.
            write_next = False
        if l.startswith('index '):
            if l in wanted:
                write_next = True
                # Include the preceding header line, but never wrap around to
                # the end of the list when the index line is the first line.
                if i > 0:
                    to_write.append(lines[i-1])
                to_write.append(l)
        elif write_next:
            to_write.append(l)
    with open(outf, 'w') as o:
        o.writelines(f"{entry}\n" for entry in to_write)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def main(args):
    """
    Search the changed lines of a diff file and report how many of them match.

    Reads `args.diff_file` and walks through its lines. When
    `args.search_from` is set, lines starting with `- ` are counted as changes
    and tested against it; if `args.search_to` is also set, the line directly
    after the `- ` line must match `args.search_to` as well. When only
    `args.search_to` is set, lines starting with `+ ` are counted and tested
    instead. `args.regex` selects regex or substring matching (see `ck_line`).

    With `args.print_hit`, each hit is printed together with its neighbouring
    line. With `args.out_file`, the diffs containing hits are written to that
    file via `write_hits`. Finally the number of changed lines, the number of
    hits and their ratio are printed (the ratio is 0.0 when there are no
    changed lines at all).

    Args:
        args: parsed command-line arguments providing `diff_file`,
            `search_from`, `search_to`, `regex`, `print_hit` and `out_file`.
    """
    changes = 0
    counter = 0
    hit_indexes = []
    with open(args.diff_file, 'r') as inf:
        lines = [raw.strip('\n') for raw in inf]

    index_id = None
    # `i` is 1-based: it is the line number shown to the user and, at the same
    # time, the 0-based position of the line that follows `line`.
    for i, line in enumerate(lines, start=1):
        if line.startswith('index '):
            # Remember which file-level diff the following lines belong to.
            index_id = line

        if args.search_from:
            if not line.startswith('- '):
                continue
            changes += 1
            if not ck_line(line, args.regex, args.search_from):
                continue
            # The `- ` line may be the very last line of the file, in which
            # case there is no following line to inspect or print.
            following = lines[i] if i < len(lines) else None
            if args.search_to and (
                following is None
                or not ck_line(following, args.regex, args.search_to)
            ):
                continue
            counter += 1
            hit_indexes.append(index_id)
            if args.print_hit:
                print(f"{i} | {line}")
                if following is not None:
                    print(f"{i+1} | {following}")
        else:
            if not line.startswith('+ '):
                continue
            changes += 1
            if not ck_line(line, args.regex, args.search_to):
                continue
            counter += 1
            hit_indexes.append(index_id)
            if args.print_hit:
                # Only show the preceding line when one exists; indexing with
                # -1 would silently print the last line of the file instead.
                if i >= 2:
                    print(f"{i-1} | {lines[i-2]}")
                print(f"{i} | {line}")

    if args.out_file:
        write_hits(hit_indexes, lines, args.out_file)

    # Avoid a ZeroDivisionError when the diff contains no changed lines.
    ratio = counter / changes if changes else 0.0
    print(changes, counter, ratio)
|
||
|
||
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: parse the options, expand `--search-both` into
    # `--search-from` + `--search-to`, and run the search. At least one search
    # string is required; otherwise argparse reports the error on stderr and
    # exits with a non-zero status so that calling scripts notice the failure.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-d", "--diff-file", required=True, help="Path to .diff file.")
    parser.add_argument("-s", "--search-from", type=str, default=None,
        help="Substring to search for in the change from line (`-`). If given together with `-S | --search-to`, results are given when matches are found in both lines."
        )
    parser.add_argument("-S", "--search-to", type=str, default=None,
        help="Substring to search for in the change to line (`+`)."
        )
    parser.add_argument("-b", "--search-both", type=str, default=None,
        help="Search for the same substring in both `+` and `-` lines. Equivalent of setting the `-s` and `-S` args with the same input."
        )
    parser.add_argument("-r", "--regex", action="store_true",
        help="Treat search strings as regular expression patterns."
        )
    parser.add_argument('-p', '--print-hit', action='store_true',
        help='Print match results to console.'
        )
    parser.add_argument('-o', '--out-file', type=str,
        help="Write matched diffs to the given output file."
        )
    args = parser.parse_args()
    if args.search_both:
        # `-b` is shorthand for passing the same string to `-s` and `-S`.
        args.search_from = args.search_both
        args.search_to = args.search_both
    if not (args.search_from or args.search_to):
        parser.error("You need to use `-s` or `-S` (or both), or `-b`. Try again.")
    main(args)
|
File renamed without changes.
This file was deleted.
Oops, something went wrong.