190 lines
4.7 KiB
Python
190 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# http(s) URL running up to the next whitespace or ")"; stopping at ")" keeps
# the closing paren of a Markdown inline link [text](url) out of the match.
URL_REGEX = re.compile(r"https?://[^\s)]+")

# Candidate fence line: up to three leading whitespace characters (group 1),
# a run of three or more backticks or tildes (group 2), then the remainder of
# the line, i.e. the info string for an opener (group 3).
FENCE_OPEN_REGEX = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")

# ATX heading at column 0: one to six "#" (group 1) that must be followed by a
# space/tab or the end of the line, then the title text (group 2, possibly
# empty). Only spaces/tabs are consumed after the marker: a plain `\s+` would
# also match the newline under re.MULTILINE, so a bare "#" line would swallow
# the following line as its title.
HEADING_REGEX = re.compile(r"^(#{1,6})(?=[ \t]|$)[ \t]*(.*)", re.MULTILINE)

# List item marker ("-", "*" or "+") followed by whitespace, optionally indented.
BULLET_REGEX = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)

# crude but effective path detection
# Requires either a path prefix (./ ../ / or drive letter) or a slash/backslash within the match
PATH_REGEX = re.compile(r"(?:\./|\.\./|/|[A-Za-z]:\\)[\w\-/\\\.]+|[\w\-\.]+[/\\][\w\-/\\\.]+")
|
|
|
|
|
|
class ValidationResult:
    """Accumulates the outcome of a validation run.

    Attributes:
        is_valid: True until the first error is recorded; warnings never
            change it.
        errors: Error messages, in the order they were added.
        warnings: Warning messages, in the order they were added.
    """

    def __init__(self) -> None:
        self.is_valid: bool = True
        self.errors: list[str] = []
        self.warnings: list[str] = []

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(is_valid={self.is_valid!r}, "
            f"errors={self.errors!r}, warnings={self.warnings!r})"
        )

    def add_error(self, msg: str) -> None:
        """Record an error message and mark the result as invalid."""
        self.is_valid = False
        self.errors.append(msg)

    def add_warning(self, msg: str) -> None:
        """Record a warning message; validity is left untouched."""
        self.warnings.append(msg)
|
|
|
|
|
|
def read_file(path: Path) -> str:
    """Return the text content of *path*, decoded as UTF-8.

    The encoding is pinned instead of relying on the platform locale, so the
    same file yields the same text on every machine. ``utf-8-sig`` decodes
    ordinary UTF-8 unchanged but drops a leading byte-order mark, which would
    otherwise sit in front of the first character and stop ``^``-anchored
    patterns from matching on the first line. Bytes that cannot be decoded
    are dropped (``errors="ignore"``) rather than raising.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")
|
|
|
|
|
|
# ---------- Extractors ----------
|
|
|
|
|
|
def extract_headings(text):
    """Return every HEADING_REGEX match in *text* as ``(marker, title)``.

    ``marker`` is the captured run of ``#`` characters and ``title`` is the
    captured heading text with surrounding whitespace stripped. Results are
    in document order.
    """
    found = []
    for match in HEADING_REGEX.finditer(text):
        marker, title = match.groups()
        found.append((marker, title.strip()))
    return found
|
|
|
|
|
|
def extract_code_blocks(text):
    """Collect fenced code blocks from *text* in a single line-by-line pass.

    Both backtick and tilde fences of any length (three or more) are
    recognised. A block is closed by a fence line that uses the same
    character as the opener, is at least as long, and carries nothing but
    whitespace after the fence run. Shorter or different-character fence
    lines inside an open block are therefore ordinary content, which lets a
    4-backtick block wrap 3-backtick content.

    Returns:
        A list of strings, one per closed block, each made of the block's
        lines (opening and closing fence included) joined with "\\n".
        A fence that is never closed swallows the rest of the text and is
        not returned: it indicates malformed markdown, and reporting it
        would cause false-positive validation failures.
    """
    collected = []
    current = None  # lines of the block being read; None while outside a fence
    opener_char = ""
    opener_len = 0

    for line in text.split("\n"):
        fence = FENCE_OPEN_REGEX.match(line)

        if current is None:
            # Outside a block: any fence-shaped line opens one.
            if fence:
                run = fence.group(2)
                opener_char, opener_len = run[0], len(run)
                current = [line]
            continue

        # Inside a block: every line belongs to it, closing fence included.
        current.append(line)
        if fence is None:
            continue

        run = fence.group(2)
        same_char = run[0] == opener_char
        long_enough = len(run) >= opener_len
        bare = fence.group(3).strip() == ""
        if same_char and long_enough and bare:
            collected.append("\n".join(current))
            current = None

    # A still-open block at this point is deliberately discarded.
    return collected
|
|
|
|
|
|
def extract_urls(text):
    """Return the set of distinct URLs matched by URL_REGEX in *text*.

    Trailing ``? ! . , : * _ ~`` characters are stripped from each match.
    URL_REGEX only stops at whitespace or ")", so a URL that ends a sentence
    or sits inside emphasis markers would otherwise carry that punctuation
    and compare unequal to the same URL written elsewhere without it.
    """
    # Same trailing-punctuation set that GFM excludes from extended autolinks.
    trailing_punctuation = "?!.,:*_~"
    return {
        match.rstrip(trailing_punctuation) for match in URL_REGEX.findall(text)
    }
|
|
|
|
|
|
def extract_paths(text):
    """Return the set of distinct substrings of *text* matched by PATH_REGEX."""
    return {match.group(0) for match in PATH_REGEX.finditer(text)}
|
|
|
|
|
|
def count_bullets(text):
    """Return how many times BULLET_REGEX matches in *text*."""
    return sum(1 for _ in BULLET_REGEX.finditer(text))
|
|
|
|
|
|
# ---------- Validators ----------
|
|
|
|
|
|
def validate_headings(orig, comp, result):
    """Compare the headings of the original and compressed text.

    A different number of headings is recorded on *result* as an error. Any
    difference in the heading lists (level, text or order) is recorded as a
    warning, so a count mismatch produces both an error and a warning.
    """
    original_headings = extract_headings(orig)
    compressed_headings = extract_headings(comp)

    n_original = len(original_headings)
    n_compressed = len(compressed_headings)
    if n_original != n_compressed:
        result.add_error(f"Heading count mismatch: {n_original} vs {n_compressed}")

    if original_headings != compressed_headings:
        result.add_warning("Heading text/order changed")
|
|
|
|
|
|
def validate_code_blocks(orig, comp, result):
    """Record an error on *result* unless both texts yield identical code blocks.

    The lists returned by extract_code_blocks must be equal, i.e. the same
    blocks with the same content in the same order.
    """
    original_blocks = extract_code_blocks(orig)
    compressed_blocks = extract_code_blocks(comp)
    if original_blocks == compressed_blocks:
        return
    result.add_error("Code blocks not preserved exactly")
|
|
|
|
|
|
def validate_urls(orig, comp, result):
    """Record an error on *result* when the two texts' URL sets differ.

    The message lists the URLs found only in the original (``lost``) and
    those found only in the compressed text (``added``).
    """
    original_urls = extract_urls(orig)
    compressed_urls = extract_urls(comp)
    if original_urls == compressed_urls:
        return

    lost = original_urls - compressed_urls
    added = compressed_urls - original_urls
    result.add_error(f"URL mismatch: lost={lost}, added={added}")
|
|
|
|
|
|
def validate_paths(orig, comp, result):
    """Record a warning on *result* when the two texts' path sets differ.

    The message lists the paths found only in the original (``lost``) and
    those found only in the compressed text (``added``). A mismatch is a
    warning, not an error.
    """
    original_paths = extract_paths(orig)
    compressed_paths = extract_paths(comp)
    if original_paths == compressed_paths:
        return

    lost = original_paths - compressed_paths
    added = compressed_paths - original_paths
    result.add_warning(f"Path mismatch: lost={lost}, added={added}")
|
|
|
|
|
|
def validate_bullets(orig, comp, result):
    """Warn on *result* when the bullet count changes by more than 15%.

    The change is measured relative to the original's bullet count. When the
    original has no bullets the check is skipped entirely, which also avoids
    dividing by zero.
    """
    original_count = count_bullets(orig)
    compressed_count = count_bullets(comp)

    if original_count == 0:
        return

    relative_change = abs(original_count - compressed_count) / original_count
    if relative_change > 0.15:
        result.add_warning(
            f"Bullet count changed too much: {original_count} -> {compressed_count}"
        )
|
|
|
|
|
|
# ---------- Main ----------
|
|
|
|
|
|
def validate(original_path: Path, compressed_path: Path) -> ValidationResult:
    """Run every validator on the two files and return the combined result.

    Both files are read with read_file, then the heading, code-block, URL,
    path and bullet checks run in that order against one shared
    ValidationResult.
    """
    original_text = read_file(original_path)
    compressed_text = read_file(compressed_path)

    outcome = ValidationResult()
    checks = (
        validate_headings,
        validate_code_blocks,
        validate_urls,
        validate_paths,
        validate_bullets,
    )
    for check in checks:
        check(original_text, compressed_text, outcome)

    return outcome
|
|
|
|
|
|
# ---------- CLI ----------
|
|
|
|
def main(argv=None):
    """Command-line entry point: validate a compressed file against its original.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        The process exit status: 0 when validation passes, 1 when the
        arguments are wrong, an input file cannot be read, or validation
        reports at least one error.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print("Usage: python validate.py <original> <compressed>", file=sys.stderr)
        return 1

    original_path = Path(args[0]).resolve()
    compressed_path = Path(args[1]).resolve()

    try:
        res = validate(original_path, compressed_path)
    except OSError as exc:
        # Missing or unreadable input: report it instead of dumping a traceback.
        print(f"Error: cannot read input: {exc}", file=sys.stderr)
        return 1

    print(f"\nValid: {res.is_valid}")

    if res.errors:
        print("\nErrors:")
        for e in res.errors:
            print(f" - {e}")

    if res.warnings:
        print("\nWarnings:")
        for w in res.warnings:
            print(f" - {w}")

    # A failed validation must be visible to shells and CI through the status.
    return 0 if res.is_valid else 1


if __name__ == "__main__":
    raise SystemExit(main())
|