updates
This commit is contained in:
189
caveman-compress/scripts/validate.py
Normal file
189
caveman-compress/scripts/validate.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
URL_REGEX = re.compile(r"https?://[^\s)]+")
|
||||
FENCE_OPEN_REGEX = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")
|
||||
HEADING_REGEX = re.compile(r"^(#{1,6})\s+(.*)", re.MULTILINE)
|
||||
BULLET_REGEX = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
|
||||
|
||||
# crude but effective path detection
|
||||
# Requires either a path prefix (./ ../ / or drive letter) or a slash/backslash within the match
|
||||
PATH_REGEX = re.compile(r"(?:\./|\.\./|/|[A-Za-z]:\\)[\w\-/\\\.]+|[\w\-\.]+[/\\][\w\-/\\\.]+")
|
||||
|
||||
|
||||
class ValidationResult:
|
||||
def __init__(self):
|
||||
self.is_valid = True
|
||||
self.errors = []
|
||||
self.warnings = []
|
||||
|
||||
def add_error(self, msg):
|
||||
self.is_valid = False
|
||||
self.errors.append(msg)
|
||||
|
||||
def add_warning(self, msg):
|
||||
self.warnings.append(msg)
|
||||
|
||||
|
||||
def read_file(path: Path) -> str:
|
||||
return path.read_text(errors="ignore")
|
||||
|
||||
|
||||
# ---------- Extractors ----------
|
||||
|
||||
|
||||
def extract_headings(text):
|
||||
return [(level, title.strip()) for level, title in HEADING_REGEX.findall(text)]
|
||||
|
||||
|
||||
def extract_code_blocks(text):
|
||||
"""Line-based fenced code block extractor.
|
||||
|
||||
Handles ``` and ~~~ fences with variable length (CommonMark: closing
|
||||
fence must use same char and be at least as long as opening). Supports
|
||||
nested fences (e.g. an outer 4-backtick block wrapping inner 3-backtick
|
||||
content).
|
||||
"""
|
||||
blocks = []
|
||||
lines = text.split("\n")
|
||||
i = 0
|
||||
n = len(lines)
|
||||
while i < n:
|
||||
m = FENCE_OPEN_REGEX.match(lines[i])
|
||||
if not m:
|
||||
i += 1
|
||||
continue
|
||||
fence_char = m.group(2)[0]
|
||||
fence_len = len(m.group(2))
|
||||
open_line = lines[i]
|
||||
block_lines = [open_line]
|
||||
i += 1
|
||||
closed = False
|
||||
while i < n:
|
||||
close_m = FENCE_OPEN_REGEX.match(lines[i])
|
||||
if (
|
||||
close_m
|
||||
and close_m.group(2)[0] == fence_char
|
||||
and len(close_m.group(2)) >= fence_len
|
||||
and close_m.group(3).strip() == ""
|
||||
):
|
||||
block_lines.append(lines[i])
|
||||
closed = True
|
||||
i += 1
|
||||
break
|
||||
block_lines.append(lines[i])
|
||||
i += 1
|
||||
if closed:
|
||||
blocks.append("\n".join(block_lines))
|
||||
# Unclosed fences are silently skipped — they indicate malformed markdown
|
||||
# and including them would cause false-positive validation failures.
|
||||
return blocks
|
||||
|
||||
|
||||
def extract_urls(text):
|
||||
return set(URL_REGEX.findall(text))
|
||||
|
||||
|
||||
def extract_paths(text):
|
||||
return set(PATH_REGEX.findall(text))
|
||||
|
||||
|
||||
def count_bullets(text):
|
||||
return len(BULLET_REGEX.findall(text))
|
||||
|
||||
|
||||
# ---------- Validators ----------
|
||||
|
||||
|
||||
def validate_headings(orig, comp, result):
|
||||
h1 = extract_headings(orig)
|
||||
h2 = extract_headings(comp)
|
||||
|
||||
if len(h1) != len(h2):
|
||||
result.add_error(f"Heading count mismatch: {len(h1)} vs {len(h2)}")
|
||||
|
||||
if h1 != h2:
|
||||
result.add_warning("Heading text/order changed")
|
||||
|
||||
|
||||
def validate_code_blocks(orig, comp, result):
|
||||
c1 = extract_code_blocks(orig)
|
||||
c2 = extract_code_blocks(comp)
|
||||
|
||||
if c1 != c2:
|
||||
result.add_error("Code blocks not preserved exactly")
|
||||
|
||||
|
||||
def validate_urls(orig, comp, result):
|
||||
u1 = extract_urls(orig)
|
||||
u2 = extract_urls(comp)
|
||||
|
||||
if u1 != u2:
|
||||
result.add_error(f"URL mismatch: lost={u1 - u2}, added={u2 - u1}")
|
||||
|
||||
|
||||
def validate_paths(orig, comp, result):
|
||||
p1 = extract_paths(orig)
|
||||
p2 = extract_paths(comp)
|
||||
|
||||
if p1 != p2:
|
||||
result.add_warning(f"Path mismatch: lost={p1 - p2}, added={p2 - p1}")
|
||||
|
||||
|
||||
def validate_bullets(orig, comp, result):
|
||||
b1 = count_bullets(orig)
|
||||
b2 = count_bullets(comp)
|
||||
|
||||
if b1 == 0:
|
||||
return
|
||||
|
||||
diff = abs(b1 - b2) / b1
|
||||
|
||||
if diff > 0.15:
|
||||
result.add_warning(f"Bullet count changed too much: {b1} -> {b2}")
|
||||
|
||||
|
||||
# ---------- Main ----------
|
||||
|
||||
|
||||
def validate(original_path: Path, compressed_path: Path) -> ValidationResult:
|
||||
result = ValidationResult()
|
||||
|
||||
orig = read_file(original_path)
|
||||
comp = read_file(compressed_path)
|
||||
|
||||
validate_headings(orig, comp, result)
|
||||
validate_code_blocks(orig, comp, result)
|
||||
validate_urls(orig, comp, result)
|
||||
validate_paths(orig, comp, result)
|
||||
validate_bullets(orig, comp, result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------- CLI ----------
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: python validate.py <original> <compressed>")
|
||||
sys.exit(1)
|
||||
|
||||
orig = Path(sys.argv[1]).resolve()
|
||||
comp = Path(sys.argv[2]).resolve()
|
||||
|
||||
res = validate(orig, comp)
|
||||
|
||||
print(f"\nValid: {res.is_valid}")
|
||||
|
||||
if res.errors:
|
||||
print("\nErrors:")
|
||||
for e in res.errors:
|
||||
print(f" - {e}")
|
||||
|
||||
if res.warnings:
|
||||
print("\nWarnings:")
|
||||
for w in res.warnings:
|
||||
print(f" - {w}")
|
||||
Reference in New Issue
Block a user