From 35f5f2674aae66279efa93c26c5dc8961769520f Mon Sep 17 00:00:00 2001 From: Marco D'Aleo Date: Sat, 29 Nov 2025 10:02:45 +0000 Subject: [PATCH] Major rewrite of junk.py, adding user config file for custom rules, don't treat broken symlink as junk --- src/filedust/cli.py | 6 +- src/filedust/junk.py | 197 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 183 insertions(+), 20 deletions(-) diff --git a/src/filedust/cli.py b/src/filedust/cli.py index 42077f3..f550f06 100644 --- a/src/filedust/cli.py +++ b/src/filedust/cli.py @@ -12,7 +12,7 @@ from rich.table import Table from rich.prompt import Confirm from rich import box -from .junk import Finding, iter_junk +from .junk import Finding, iter_junk, load_user_rules console = Console() @@ -168,6 +168,7 @@ def delete_all(findings: List[Finding]) -> int: def main(argv: list[str] | None = None) -> int: + print("Looking for junk ...") parser = build_parser() args = parser.parse_args(argv) @@ -196,7 +197,8 @@ def main(argv: list[str] | None = None) -> int: "This may take a while and may require sudo for deletions.[/]" ) - findings = list(iter_junk(root)) + rules = load_user_rules() + findings = list(iter_junk(root, rules=rules)) total_size = compute_total_size(findings) if not findings: diff --git a/src/filedust/junk.py b/src/filedust/junk.py index 54c6543..19f572a 100644 --- a/src/filedust/junk.py +++ b/src/filedust/junk.py @@ -1,12 +1,41 @@ from __future__ import annotations import os +import configparser from dataclasses import dataclass from fnmatch import fnmatch from pathlib import Path from typing import Iterable, List +class UserRules: + def __init__(self): + self.include: list[str] = [] + self.exclude: list[str] = [] + + +def load_user_rules() -> UserRules: + rules = UserRules() + cfg_path = Path.home() / ".filedust.conf" + + if cfg_path.exists(): + parser = configparser.ConfigParser(allow_no_value=True) + parser.read(cfg_path) + + if parser.has_section("include"): + rules.include = list(parser["include"].keys()) + + if parser.has_section("exclude"): + rules.exclude = list(parser["exclude"].keys()) + + return rules + + +def matches_any(patterns: list[str], relpath: Path) -> bool: + posix = relpath.as_posix() + return any(fnmatch(posix, p) for p in patterns) + + @dataclass class Finding: path: Path @@ -53,6 +82,7 @@ JUNK_FILE_PATTERNS = [ SKIP_DIR_NAMES = { ".cache", "build", + ".gnupg", ".git", ".hg", ".svn", @@ -62,6 +92,34 @@ SKIP_DIR_NAMES = { } +HOME = Path.home().resolve() + + +def safe_exists(path: Path) -> bool | None: + """Return True/False if the path exists, or None if permission denied.""" + try: + return path.exists() + except Exception: + return None + + +def safe_resolve(path: Path, root: Path) -> Path | None: + """ + Resolve symlinks only if safe. + Return resolved path if it stays within root. + Return None if: + - resolution escapes the root + - resolution fails + - permission denied + """ + try: + resolved = path.resolve(strict=False) # NEVER strict + resolved.relative_to(root) # ensure containment + return resolved + except Exception: + return None + + def is_junk_dir_name(name: str) -> bool: return name in JUNK_DIR_NAMES @@ -70,37 +128,140 @@ def is_junk_file_name(name: str) -> bool: return any(fnmatch(name, pattern) for pattern in JUNK_FILE_PATTERNS) -def iter_junk(root: Path) -> Iterable[Finding]: +def iter_junk(root: Path, rules: UserRules | None = None) -> Iterable[Finding]: """ - Walk the tree under `root` and yield junk candidates. + Safe, fast junk scanner: + - Never follows symlinks. + - Broken symlinks are not automatically junk — they follow normal rules. + - User include/exclude overrides all. + - Built-in junk rules applied only when safe. + - SKIP_DIR_NAMES protected unless user includes. + - Fully contained in $HOME. + - No crashes from PermissionError or unreadable paths. + """ + if rules is None: + rules = UserRules() - filedust: - - Skips known critical / config directories (SKIP_DIR_NAMES). - - Treats known "junk" directory names as removable as a whole. - - Treats known junk file patterns as removable. - """ root = root.resolve() + root_str = str(root) - for dirpath, dirnames, filenames in os.walk(root): + for dirpath, dirnames, filenames in os.walk(root, followlinks=False): dirpath_p = Path(dirpath) - # Prune dirs we never touch at all. - dirnames[:] = [d for d in dirnames if d not in SKIP_DIR_NAMES] + # Fast relative path computation + if dirpath == root_str: + rel_dir = Path(".") + else: + rel_dir = Path(dirpath[len(root_str) :].lstrip("/")) - # Detect junk directories (and skip walking inside them). + # USER EXCLUDE → skip entire subtree + if matches_any(rules.exclude, rel_dir): + dirnames[:] = [] + continue + + pruned = [] + + # Handling dirs + for d in dirnames: + child = dirpath_p / d + + try: + st = child.lstat() + except Exception: + continue # unreadable + + is_symlink = (st.st_mode & 0o170000) == 0o120000 + + if is_symlink: + # If broken symlink dir treat as file later via filenames (skip descent) + continue + + rel_child = rel_dir / d + + # User exclude wins + if matches_any(rules.exclude, rel_child): + continue + + # SKIP_DIR_NAMES unless user includes + if d in SKIP_DIR_NAMES and not matches_any( + rules.include, rel_child + ): + continue + + pruned.append(d) + + dirnames[:] = pruned + + # Detect JUNK dirs i = 0 while i < len(dirnames): name = dirnames[i] - if is_junk_dir_name(name): - junk_dir = dirpath_p / name - yield Finding(path=junk_dir, kind="dir", reason="junk_dir") - # Remove from walk so we don't descend into it. + rel_child = rel_dir / name + + # User include directory + if matches_any(rules.include, rel_child): + yield Finding(dirpath_p / name, "dir", "user_include") del dirnames[i] continue + + # Built-in safe junk dirs + if is_junk_dir_name(name): + yield Finding(dirpath_p / name, "dir", "junk_dir") + del dirnames[i] + continue + i += 1 - # Now process files. + # Handling files (including symlinks) for fname in filenames: + fpath = dirpath_p / fname + rel_file = rel_dir / fname + + try: + st = fpath.lstat() + except Exception: + continue + + is_symlink = (st.st_mode & 0o170000) == 0o120000 + + # Handling broken symlinks + if is_symlink: + exists = safe_exists(fpath) + + # Permission denied → skip + if exists is None: + continue + + # User exclude wins + if matches_any(rules.exclude, rel_file): + continue + + # User include wins + if matches_any(rules.include, rel_file): + yield Finding(fpath, "file", "user_include") + continue + + # Broken symlink? + if exists is False: + # DO NOT auto-delete — classify like regular file + # Only built-in junk patterns apply + if is_junk_file_name(fname): + yield Finding(fpath, "file", "broken_symlink") + continue + + # Valid symlink — NEVER follow; only user-include counts + continue + + # Regular files + # User exclude wins + if matches_any(rules.exclude, rel_file): + continue + + # User include wins + if matches_any(rules.include, rel_file): + yield Finding(fpath, "file", "user_include") + continue + + # Built-in junk patterns (safe ones) if is_junk_file_name(fname): - fpath = dirpath_p / fname - yield Finding(path=fpath, kind="file", reason="junk_file") + yield Finding(fpath, "file", "junk_file")