Major rewrite of junk.py, adding user config file for custom rules, don't treat broken symlink as junk

This commit is contained in:
2025-11-29 10:02:45 +00:00
parent c75a5246e3
commit 35f5f2674a
2 changed files with 183 additions and 20 deletions

View File

@@ -12,7 +12,7 @@ from rich.table import Table
from rich.prompt import Confirm
from rich import box
from .junk import Finding, iter_junk
from .junk import Finding, iter_junk, load_user_rules
console = Console()
@@ -168,6 +168,7 @@ def delete_all(findings: List[Finding]) -> int:
def main(argv: list[str] | None = None) -> int:
print("Looking for junk ...")
parser = build_parser()
args = parser.parse_args(argv)
@@ -196,7 +197,8 @@ def main(argv: list[str] | None = None) -> int:
"This may take a while and may require sudo for deletions.[/]"
)
findings = list(iter_junk(root))
rules = load_user_rules()
findings = list(iter_junk(root, rules=rules))
total_size = compute_total_size(findings)
if not findings:

View File

@@ -1,12 +1,41 @@
from __future__ import annotations
import os
import configparser
from dataclasses import dataclass
from fnmatch import fnmatch
from pathlib import Path
from typing import Iterable, List
class UserRules:
def __init__(self):
self.include: list[str] = []
self.exclude: list[str] = []
def load_user_rules() -> UserRules:
rules = UserRules()
cfg_path = Path.home() / ".filedust.conf"
if cfg_path.exists():
parser = configparser.ConfigParser(allow_no_value=True)
parser.read(cfg_path)
if parser.has_section("include"):
rules.include = list(parser["include"].keys())
if parser.has_section("exclude"):
rules.exclude = list(parser["exclude"].keys())
return rules
def matches_any(patterns: list[str], relpath: Path) -> bool:
posix = relpath.as_posix()
return any(fnmatch(posix, p) for p in patterns)
@dataclass
class Finding:
path: Path
@@ -53,6 +82,7 @@ JUNK_FILE_PATTERNS = [
SKIP_DIR_NAMES = {
".cache",
"build",
".gnupg",
".git",
".hg",
".svn",
@@ -62,6 +92,34 @@ SKIP_DIR_NAMES = {
}
HOME = Path.home().resolve()
def safe_exists(path: Path) -> bool | None:
"""Return True/False if the path exists, or None if permission denied."""
try:
return path.exists()
except Exception:
return None
def safe_resolve(path: Path, root: Path) -> Path | None:
"""
Resolve symlinks only if safe.
Return resolved path if it stays within root.
Return None if:
- resolution escapes the root
- resolution fails
- permission denied
"""
try:
resolved = path.resolve(strict=False) # NEVER strict
resolved.relative_to(root) # ensure containment
return resolved
except Exception:
return None
def is_junk_dir_name(name: str) -> bool:
return name in JUNK_DIR_NAMES
@@ -70,37 +128,140 @@ def is_junk_file_name(name: str) -> bool:
return any(fnmatch(name, pattern) for pattern in JUNK_FILE_PATTERNS)
def iter_junk(root: Path) -> Iterable[Finding]:
def iter_junk(root: Path, rules: UserRules | None = None) -> Iterable[Finding]:
"""
Walk the tree under `root` and yield junk candidates.
Safe, fast junk scanner:
- Never follows symlinks.
- Broken symlinks are not automatically junk — they follow normal rules.
- User include/exclude overrides all.
- Built-in junk rules applied only when safe.
- SKIP_DIR_NAMES protected unless user includes.
- Fully contained in $HOME.
- No crashes from PermissionError or unreadable paths.
"""
if rules is None:
rules = UserRules()
filedust:
- Skips known critical / config directories (SKIP_DIR_NAMES).
- Treats known "junk" directory names as removable as a whole.
- Treats known junk file patterns as removable.
"""
root = root.resolve()
root_str = str(root)
for dirpath, dirnames, filenames in os.walk(root):
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
dirpath_p = Path(dirpath)
# Prune dirs we never touch at all.
dirnames[:] = [d for d in dirnames if d not in SKIP_DIR_NAMES]
# Fast relative path computation
if dirpath == root_str:
rel_dir = Path(".")
else:
rel_dir = Path(dirpath[len(root_str) :].lstrip("/"))
# Detect junk directories (and skip walking inside them).
# USER EXCLUDE → skip entire subtree
if matches_any(rules.exclude, rel_dir):
dirnames[:] = []
continue
pruned = []
# Handling dirs
for d in dirnames:
child = dirpath_p / d
try:
st = child.lstat()
except Exception:
continue # unreadable
is_symlink = (st.st_mode & 0o170000) == 0o120000
if is_symlink:
# If broken symlink dir treat as file later via filenames (skip descent)
continue
rel_child = rel_dir / d
# User exclude wins
if matches_any(rules.exclude, rel_child):
continue
# SKIP_DIR_NAMES unless user includes
if d in SKIP_DIR_NAMES and not matches_any(
rules.include, rel_child
):
continue
pruned.append(d)
dirnames[:] = pruned
# Detect JUNK dirs
i = 0
while i < len(dirnames):
name = dirnames[i]
if is_junk_dir_name(name):
junk_dir = dirpath_p / name
yield Finding(path=junk_dir, kind="dir", reason="junk_dir")
# Remove from walk so we don't descend into it.
rel_child = rel_dir / name
# User include directory
if matches_any(rules.include, rel_child):
yield Finding(dirpath_p / name, "dir", "user_include")
del dirnames[i]
continue
# Built-in safe junk dirs
if is_junk_dir_name(name):
yield Finding(dirpath_p / name, "dir", "junk_dir")
del dirnames[i]
continue
i += 1
# Now process files.
# Handling files (including symlinks)
for fname in filenames:
fpath = dirpath_p / fname
rel_file = rel_dir / fname
try:
st = fpath.lstat()
except Exception:
continue
is_symlink = (st.st_mode & 0o170000) == 0o120000
# Handling broken symlinks
if is_symlink:
exists = safe_exists(fpath)
# Permission denied → skip
if exists is None:
continue
# User exclude wins
if matches_any(rules.exclude, rel_file):
continue
# User include wins
if matches_any(rules.include, rel_file):
yield Finding(fpath, "file", "user_include")
continue
# Broken symlink?
if exists is False:
# DO NOT auto-delete — classify like regular file
# Only built-in junk patterns apply
if is_junk_file_name(fname):
yield Finding(fpath, "file", "broken_symlink")
continue
# Valid symlink — NEVER follow; only user-include counts
continue
# Regular files
# User exclude wins
if matches_any(rules.exclude, rel_file):
continue
# User include wins
if matches_any(rules.include, rel_file):
yield Finding(fpath, "file", "user_include")
continue
# Built-in junk patterns (safe ones)
if is_junk_file_name(fname):
fpath = dirpath_p / fname
yield Finding(path=fpath, kind="file", reason="junk_file")
yield Finding(fpath, "file", "junk_file")