Files
pi-skills/caveman-compress/scripts/validate.py
alex wiesner 5d5d0e2d26 updates
2026-04-12 06:47:14 +01:00

190 lines
4.7 KiB
Python

#!/usr/bin/env python3
import re
from pathlib import Path
# Matches "http://" or "https://" followed by one or more characters that are
# neither whitespace nor ")". Trailing punctuation such as "." or "," is kept
# as part of the match.
URL_REGEX = re.compile(r"https?://[^\s)]+")
# Matches a fence line. Group 1: up to three leading whitespace characters.
# Group 2: a run of three or more backticks, or three or more tildes.
# Group 3: whatever follows the fence run on the same line.
FENCE_OPEN_REGEX = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")
# Matches one to six "#" characters at the start of a line followed by
# whitespace. Group 1: the "#" run. Group 2: the text after that whitespace,
# up to the end of the line.
HEADING_REGEX = re.compile(r"^(#{1,6})\s+(.*)", re.MULTILINE)
# Matches the start of a line that, after optional whitespace, has one of
# "-", "*" or "+" followed by whitespace.
BULLET_REGEX = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
# crude but effective path detection
# Requires either a path prefix (./ ../ / or drive letter) or a slash/backslash within the match
# The only group is non-capturing, so findall() returns the whole matched text.
PATH_REGEX = re.compile(r"(?:\./|\.\./|/|[A-Za-z]:\\)[\w\-/\\\.]+|[\w\-\.]+[/\\][\w\-/\\\.]+")
class ValidationResult:
    """Accumulates the outcome of a validation run.

    Attributes:
        is_valid: True until the first error is recorded.
        errors: Error messages, in the order they were added.
        warnings: Warning messages, in the order they were added.
    """

    def __init__(self):
        # Each instance gets its own fresh lists.
        self.errors, self.warnings = [], []
        self.is_valid = True

    def add_error(self, msg):
        """Record an error message and mark the result as invalid."""
        self.errors.append(msg)
        self.is_valid = False

    def add_warning(self, msg):
        """Record a warning message; validity is left untouched."""
        self.warnings.append(msg)
def read_file(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8 text.

    The encoding is pinned to UTF-8 so the result does not depend on the
    platform's locale encoding. Bytes that are not valid UTF-8 are dropped
    (``errors="ignore"``) rather than raising, keeping the read best-effort.

    Args:
        path: File to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8", errors="ignore")
# ---------- Extractors ----------
def extract_headings(text):
    """Return every heading in *text* as a ``(hashes, title)`` tuple.

    ``hashes`` is the literal run of ``#`` characters matched by
    HEADING_REGEX (not a number), and ``title`` is the heading text with
    surrounding whitespace stripped. Results are in document order.
    """
    headings = []
    for match in HEADING_REGEX.finditer(text):
        hashes, title = match.groups()
        headings.append((hashes, title.strip()))
    return headings
def extract_code_blocks(text):
    """Collect the fenced code blocks found in *text*.

    The text is scanned line by line. A line matching FENCE_OPEN_REGEX opens
    a block; the block is closed by a later fence line that uses the same
    fence character, is at least as long as the opening fence, and has
    nothing but whitespace after the fence run. Because a shorter fence
    cannot close a longer one, an outer 4-backtick block may wrap inner
    3-backtick content.

    Returns:
        A list of strings, one per closed block, each containing the opening
        fence line, the body lines and the closing fence line joined with
        ``"\\n"``.
    """
    blocks = []
    pending = None  # lines of the block currently open, or None when outside
    marker = ""     # fence character of the open block
    min_len = 0     # length of the opening fence run

    for line in text.split("\n"):
        fence = FENCE_OPEN_REGEX.match(line)

        if pending is None:
            # Outside any block: only a fence line is of interest.
            if fence:
                run = fence.group(2)
                marker, min_len = run[0], len(run)
                pending = [line]
            continue

        # Inside a block: every line belongs to it, the closing fence included.
        pending.append(line)
        closes_block = (
            fence is not None
            and fence.group(2)[0] == marker
            and len(fence.group(2)) >= min_len
            and not fence.group(3).strip()
        )
        if closes_block:
            blocks.append("\n".join(pending))
            pending = None

    # Unclosed fences are silently skipped — they indicate malformed markdown
    # and including them would cause false-positive validation failures.
    return blocks
def extract_urls(text):
    """Return the set of distinct substrings of *text* matched by URL_REGEX."""
    return {match.group(0) for match in URL_REGEX.finditer(text)}
def extract_paths(text):
    """Return the set of distinct substrings of *text* matched by PATH_REGEX."""
    return {match.group(0) for match in PATH_REGEX.finditer(text)}
def count_bullets(text):
    """Return how many times BULLET_REGEX matches in *text*."""
    return sum(1 for _ in BULLET_REGEX.finditer(text))
# ---------- Validators ----------
def validate_headings(orig, comp, result):
    """Compare the headings of the original and compressed text.

    Records an error on *result* when the number of headings differs, and a
    warning whenever the heading lists are not identical (which includes the
    count-mismatch case).
    """
    original_headings = extract_headings(orig)
    compressed_headings = extract_headings(comp)

    if original_headings == compressed_headings:
        return

    before, after = len(original_headings), len(compressed_headings)
    if before != after:
        result.add_error(f"Heading count mismatch: {before} vs {after}")
    result.add_warning("Heading text/order changed")
def validate_code_blocks(orig, comp, result):
    """Record an error on *result* unless both texts yield identical code blocks.

    The lists returned by extract_code_blocks are compared as a whole, so
    content, count and order must all match.
    """
    original_blocks = extract_code_blocks(orig)
    compressed_blocks = extract_code_blocks(comp)
    if original_blocks == compressed_blocks:
        return
    result.add_error("Code blocks not preserved exactly")
def validate_urls(orig, comp, result):
    """Record an error on *result* when the two texts' URL sets differ.

    The message lists the URLs only in the original ("lost") and those only
    in the compressed text ("added").
    """
    original_urls = extract_urls(orig)
    compressed_urls = extract_urls(comp)
    lost = original_urls - compressed_urls
    added = compressed_urls - original_urls
    # The sets differ exactly when at least one side has an extra element.
    if lost or added:
        result.add_error(f"URL mismatch: lost={lost}, added={added}")
def validate_paths(orig, comp, result):
    """Record a warning on *result* when the two texts' path sets differ.

    The message lists the paths only in the original ("lost") and those only
    in the compressed text ("added").
    """
    original_paths = extract_paths(orig)
    compressed_paths = extract_paths(comp)
    lost = original_paths - compressed_paths
    added = compressed_paths - original_paths
    # The sets differ exactly when at least one side has an extra element.
    if lost or added:
        result.add_warning(f"Path mismatch: lost={lost}, added={added}")
def validate_bullets(orig, comp, result):
    """Warn when the bullet count shifts by more than 15% of the original.

    Nothing is reported when the original text has no bullets; that also
    avoids dividing by zero.
    """
    before = count_bullets(orig)
    after = count_bullets(comp)
    if before and abs(before - after) / before > 0.15:
        result.add_warning(f"Bullet count changed too much: {before} -> {after}")
# ---------- Main ----------
def validate(original_path: Path, compressed_path: Path) -> ValidationResult:
    """Run every validator against the two files and return the findings.

    Both files are read with read_file, then the heading, code-block, URL,
    path and bullet validators run in that order against the same
    ValidationResult.

    Args:
        original_path: File holding the original text.
        compressed_path: File holding the compressed text.

    Returns:
        The ValidationResult populated by the validators.
    """
    report = ValidationResult()
    original_text = read_file(original_path)
    compressed_text = read_file(compressed_path)

    checks = (
        validate_headings,
        validate_code_blocks,
        validate_urls,
        validate_paths,
        validate_bullets,
    )
    for check in checks:
        check(original_text, compressed_text, report)
    return report
# ---------- CLI ----------
if __name__ == "__main__":
    import sys

    # Expect exactly two positional arguments: <original> <compressed>.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python validate.py <original> <compressed>")
        sys.exit(1)

    original_file, compressed_file = (Path(arg).resolve() for arg in cli_args)
    report = validate(original_file, compressed_file)

    print(f"\nValid: {report.is_valid}")
    # Print each non-empty section under its header, errors first.
    sections = (("\nErrors:", report.errors), ("\nWarnings:", report.warnings))
    for header, messages in sections:
        if messages:
            print(header)
            for message in messages:
                print(f" - {message}")