122 lines
3.9 KiB
Python
122 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Detect whether a file is natural language (compressible) or code/config (skip)."""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Extensions that are natural language and compressible
|
|
COMPRESSIBLE_EXTENSIONS = {".md", ".txt", ".markdown", ".rst"}
|
|
|
|
# Extensions that are code/config and should be skipped
|
|
SKIP_EXTENSIONS = {
|
|
".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
|
|
".toml", ".env", ".lock", ".css", ".scss", ".html", ".xml",
|
|
".sql", ".sh", ".bash", ".zsh", ".go", ".rs", ".java", ".c",
|
|
".cpp", ".h", ".hpp", ".rb", ".php", ".swift", ".kt", ".lua",
|
|
".dockerfile", ".makefile", ".csv", ".ini", ".cfg",
|
|
}
|
|
|
|
# Patterns that indicate a line is code
|
|
CODE_PATTERNS = [
|
|
re.compile(r"^\s*(import |from .+ import |require\(|const |let |var )"),
|
|
re.compile(r"^\s*(def |class |function |async function |export )"),
|
|
re.compile(r"^\s*(if\s*\(|for\s*\(|while\s*\(|switch\s*\(|try\s*\{)"),
|
|
re.compile(r"^\s*[\}\]\);]+\s*$"), # closing braces/brackets
|
|
re.compile(r"^\s*@\w+"), # decorators/annotations
|
|
re.compile(r'^\s*"[^"]+"\s*:\s*'), # JSON-like key-value
|
|
re.compile(r"^\s*\w+\s*=\s*[{\[\(\"']"), # assignment with literal
|
|
]
|
|
|
|
|
|
def _is_code_line(line: str) -> bool:
|
|
"""Check if a line looks like code."""
|
|
return any(p.match(line) for p in CODE_PATTERNS)
|
|
|
|
|
|
def _is_json_content(text: str) -> bool:
|
|
"""Check if content is valid JSON."""
|
|
try:
|
|
json.loads(text)
|
|
return True
|
|
except (json.JSONDecodeError, ValueError):
|
|
return False
|
|
|
|
|
|
def _is_yaml_content(lines: list[str]) -> bool:
|
|
"""Heuristic: check if content looks like YAML."""
|
|
yaml_indicators = 0
|
|
for line in lines[:30]:
|
|
stripped = line.strip()
|
|
if stripped.startswith("---"):
|
|
yaml_indicators += 1
|
|
elif re.match(r"^\w[\w\s]*:\s", stripped):
|
|
yaml_indicators += 1
|
|
elif stripped.startswith("- ") and ":" in stripped:
|
|
yaml_indicators += 1
|
|
# If most non-empty lines look like YAML
|
|
non_empty = sum(1 for l in lines[:30] if l.strip())
|
|
return non_empty > 0 and yaml_indicators / non_empty > 0.6
|
|
|
|
|
|
def detect_file_type(filepath: Path) -> str:
|
|
"""Classify a file as 'natural_language', 'code', 'config', or 'unknown'.
|
|
|
|
Returns:
|
|
One of: 'natural_language', 'code', 'config', 'unknown'
|
|
"""
|
|
ext = filepath.suffix.lower()
|
|
|
|
# Extension-based classification
|
|
if ext in COMPRESSIBLE_EXTENSIONS:
|
|
return "natural_language"
|
|
if ext in SKIP_EXTENSIONS:
|
|
return "code" if ext not in {".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".env"} else "config"
|
|
|
|
# Extensionless files (like CLAUDE.md, TODO) — check content
|
|
if not ext:
|
|
try:
|
|
text = filepath.read_text(errors="ignore")
|
|
except (OSError, PermissionError):
|
|
return "unknown"
|
|
|
|
lines = text.splitlines()[:50]
|
|
|
|
if _is_json_content(text[:10000]):
|
|
return "config"
|
|
if _is_yaml_content(lines):
|
|
return "config"
|
|
|
|
code_lines = sum(1 for l in lines if l.strip() and _is_code_line(l))
|
|
non_empty = sum(1 for l in lines if l.strip())
|
|
if non_empty > 0 and code_lines / non_empty > 0.4:
|
|
return "code"
|
|
|
|
return "natural_language"
|
|
|
|
return "unknown"
|
|
|
|
|
|
def should_compress(filepath: Path) -> bool:
|
|
"""Return True if the file is natural language and should be compressed."""
|
|
if not filepath.is_file():
|
|
return False
|
|
# Skip backup files
|
|
if filepath.name.endswith(".original.md"):
|
|
return False
|
|
return detect_file_type(filepath) == "natural_language"
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
|
|
if len(sys.argv) < 2:
|
|
print("Usage: python detect.py <file1> [file2] ...")
|
|
sys.exit(1)
|
|
|
|
for path_str in sys.argv[1:]:
|
|
p = Path(path_str).resolve()
|
|
file_type = detect_file_type(p)
|
|
compress = should_compress(p)
|
|
print(f" {p.name:30s} type={file_type:20s} compress={compress}")
|