[+] Triangular diff

pull/32/head
Hykilpikonna 2022-11-16 02:13:51 -05:00
parent d752737c38
commit cb19df10a6
No known key found for this signature in database
GPG Key ID: 256CD01A41D7FA26
1 changed files with 107 additions and 6 deletions

View File

@ -1,11 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import difflib
import os import os
from pathlib import Path from pathlib import Path
from subprocess import check_output from subprocess import check_output
import opencc import opencc
from hypy_utils import write from hypy_utils import write
from hypy_utils.tqdm_utils import pmap from hypy_utils.tqdm_utils import pmap, smap
ALLOWED_DIRS = {Path(p) for p in ['people']} ALLOWED_DIRS = {Path(p) for p in ['people']}
ALLOWED_SUF = {'.json5', '.md'} ALLOWED_SUF = {'.json5', '.md'}
@ -15,6 +16,7 @@ HANS_TO_HANT = opencc.OpenCC('s2t.json')
D_SELF = Path(__file__).parent D_SELF = Path(__file__).parent
D_PAST = D_SELF / '.convert_past' D_PAST = D_SELF / '.convert_past'
D_LAST_HASH = D_PAST / 'last-hash.txt' D_LAST_HASH = D_PAST / 'last-hash.txt'
LAST_HASH = D_LAST_HASH.read_text().strip()
def list_files() -> set[Path]: def list_files() -> set[Path]:
@ -30,27 +32,126 @@ def list_files() -> set[Path]:
return files return files
def inline_diff(old: str, new: str) -> tuple[list[str], list[str], list[tuple[str, str]]]:
    """
    Collect the character-level differences between two strings.

    :param old: Original string
    :param new: Updated string
    :return: Tuple (added, deleted, replaced): text inserted into new, text removed from old,
        and (old text, new text) pairs for replaced spans, each in order of occurrence
    """
    added: list[str] = []
    deleted: list[str] = []
    replaced: list[tuple[str, str]] = []

    opcodes = difflib.SequenceMatcher(None, old, new).get_opcodes()
    for tag, old_lo, old_hi, new_lo, new_hi in opcodes:
        # Tag is one of replace, delete, insert, equal; equal spans are not recorded
        if tag == 'replace':
            replaced.append((old[old_lo:old_hi], new[new_lo:new_hi]))
        elif tag == 'delete':
            deleted.append(old[old_lo:old_hi])
        elif tag == 'insert':
            added.append(new[new_lo:new_hi])

    return added, deleted, replaced
def inline_diff_apply(old: str, new: str, alt: str) -> str:
    """
    Carry the edits made between old and new over to an alternative string.

    Only differences between alt and new that also occur between old and new are applied,
    so text that differs solely in alt is left as it is.

    :param old: String before the edit
    :param new: String after the edit
    :param alt: Alternative string that should receive the edit
    :return: alt with the matching edits applied
    """
    # Edits that actually happened between old and new
    added, deleted, replaced = inline_diff(old, new)

    # The opcodes index into alt as it was before any edit was applied,
    # so keep track of how far the edits applied so far have shifted the text
    shift = 0
    opcodes = difflib.SequenceMatcher(None, alt, new).get_opcodes()
    for tag, lo, hi, new_lo, new_hi in opcodes:
        lo += shift
        hi += shift
        current = alt[lo:hi]
        incoming = new[new_lo:new_hi]

        # Tag can be replace, delete, insert, equal
        if tag == 'replace':
            pair = (current, incoming)
            if pair in replaced:
                print(f'[Diff] Applying [U] {pair!r}')
                alt = alt[:lo] + incoming + alt[hi:]
                shift += (new_hi - new_lo) - (hi - lo)
        elif tag == 'delete':
            if current in deleted:
                print(f'[Diff] Applying [-] {current!r}')
                alt = alt[:lo] + alt[hi:]
                shift -= hi - lo
        elif tag == 'insert':
            if incoming in added:
                print(f'[Diff] Applying [+] {incoming!r}')
                alt = alt[:lo] + incoming + alt[lo:]
                shift += new_hi - new_lo

    return alt
def process_file(f: Path):
    """
    Create or update the converted (.zh_hant) counterpart of a source file.

    When no counterpart exists yet, it is created from a full HANS_TO_HANT conversion of f.
    Otherwise the version of f at LAST_HASH is read from git; if f changed since then, the
    inline changes are applied to the existing counterpart through inline_diff_apply.

    All text is read and written as UTF-8 so that it matches the decoded git output
    regardless of the platform's locale encoding.

    :param f: Source file; files whose name contains '.zh_hant.' are skipped
    """
    if '.zh_hant.' in f.name:
        return

    hans = f.read_text(encoding='utf-8')
    converted = HANS_TO_HANT.convert(hans)

    f_hant = f.with_name(f'{f.stem}.zh_hant{f.suffix}')
    if not f_hant.is_file():
        # If hant file doesn't exist, create
        f_hant.write_text(converted, encoding='utf-8')
        return

    # Hant file exists, use diff
    hant_current = f_hant.read_text(encoding='utf-8')

    # Obtain original version from git; the <rev>:<path> spec needs forward slashes
    git_path = f.relative_to('.').as_posix()
    past = check_output(['git', 'show', f"{LAST_HASH}:{git_path}"]).decode()

    # Nothing changed, skip
    if past == hans:
        return

    print(f"\n============ CHANGED FILE: {f} ============")
    print("> Trying to apply diff...")

    # Convert the past version once; it is used both for logging and for applying the diff
    past_converted = HANS_TO_HANT.convert(past)

    # Diff: Obtain a list of inline differences from the HANS change (converted to HANT)
    a, d, r = inline_diff(past_converted, converted)
    print('> Diff from old to new:', a, d, r)

    a, d, r = inline_diff(hant_current, converted)
    print('> Diff from hant to new:', a, d, r)

    hant_new = inline_diff_apply(past_converted, converted, hant_current)
    f_hant.write_text(hant_new, encoding='utf-8')

    # Whatever still differs afterwards was not part of the source change and was kept
    a, d, r = inline_diff(hant_new, converted)
    print('> Diff from hant_new to new:', a, d, r)

    print("============ DONE ============")
if __name__ == '__main__':
    # Convert every candidate file, one after another
    smap(process_file, list_files())

    # Remember the commit this run was based on, for use as the diff base next time
    head_hash = check_output(['git', 'rev-parse', 'HEAD']).decode()
    write(D_LAST_HASH, head_hash)

    print('Done')