"""Scan files for printable strings and rank them with a password-likeness heuristic.

Each input file is searched for runs of printable ASCII, both as plain bytes
and as UTF-16LE (printable byte followed by a NUL byte).  Candidates that
survive a set of rejection filters are scored, de-duplicated, and the
highest-scoring ones are printed.
"""

import argparse
import math
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Tuple

# Byte-level patterns; the two %d slots are the minimum and maximum run length.
ASCII_RE_TEMPLATE = rb"[ -~]{%d,%d}"
UTF16LE_ASCII_RE_TEMPLATE = rb"(?:[ -~]\x00){%d,%d}"

# Only files with one of these suffixes (or no suffix at all) are scanned.
SCAN_SUFFIXES = (".dmp", ".mem", ".raw", ".bin", "")

# Canonical 8-4-4-4-12 hexadecimal GUID layout.
GUID_RE = re.compile(
    r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
)


@dataclass
class Hit:
    """One distinct candidate string and where it was first seen."""

    s: str  # the candidate string itself
    score: float  # value returned by score_string(s)
    file: Path  # file in which the string was first found
    offset: int  # byte offset of that first occurrence within the file
    kind: str  # "ascii" or "utf16le", as reported by extract_hits
    count: int = 1  # number of occurrences that passed the filters


def iter_files(paths: Iterable[str]) -> Iterator[Path]:
    """Yield every regular file named by *paths*.

    Directories are walked recursively in sorted order; arguments that are
    neither an existing file nor a directory are skipped.
    """
    for p in paths:
        path = Path(p)
        if path.is_dir():
            for child in sorted(path.rglob("*")):
                if child.is_file():
                    yield child
        elif path.is_file():
            yield path


def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character (0.0 if empty)."""
    if not s:
        return 0.0
    freq: Dict[str, int] = {}
    for ch in s:
        freq[ch] = freq.get(ch, 0) + 1
    n = len(s)
    ent = 0.0
    # Accumulate term by term (rather than with sum()) so the floating-point
    # result does not depend on the interpreter's summation algorithm.
    for c in freq.values():
        p = c / n
        ent -= p * math.log2(p)
    return ent


BAD_SUBSTRINGS = (
    "\\\\",
    "\\Registry\\",
    "\\Registry",
    "\\BaseNamedObjects\\",
    "\\BaseNamedObjects",
    ":\\",
    "/",
    "System32",
    "Windows",
    "Microsoft",
    "CLSID",
    "AppX",
    "shell:::",
    "atom(",
    ".dll",
    ".exe",
    ".sys",
    ".ini",
    ".mui",
    ".nls",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".ttf",
    ".otf",
    ".wav",
    ".mp3",
    ".mp4",
    ".sqlite",
)

# Any single one of these characters disqualifies a candidate outright.
_FORBIDDEN_CHARS = ("\\", "/", ":", "<", ">", '"', "'", "=", "\t", "\r", "\n")


def looks_passwordish(s: str) -> bool:
    """Return True unless *s* trips one of the rejection filters.

    A string is rejected when it contains a forbidden character or a
    BAD_SUBSTRINGS entry (case-sensitive), starts with an http(s) scheme,
    has four or more spaces, uses three or fewer distinct characters, or is
    a GUID.
    """
    if any(ch in s for ch in _FORBIDDEN_CHARS):
        return False
    if any(bad in s for bad in BAD_SUBSTRINGS):
        return False
    if s.startswith(("http://", "https://")):
        return False
    if s.count(" ") >= 4:
        return False
    if len(set(s)) <= 3:
        return False
    if GUID_RE.fullmatch(s):
        return False
    return True


def score_string(s: str) -> float:
    """Return the heuristic score of *s*; higher means more password-like.

    The base score is entropy times length.  Each character class present
    (lowercase, uppercase, digit, non-alphanumeric) adds 5.0; a space costs
    2.0, a single-case string costs 1.0, and an all-hex string costs 3.0.
    """
    has_lower = any("a" <= c <= "z" for c in s)
    has_upper = any("A" <= c <= "Z" for c in s)
    has_digit = any("0" <= c <= "9" for c in s)
    has_special = any(not c.isalnum() for c in s)

    score = shannon_entropy(s) * len(s)
    # The flags are bools, so each product is either 5.0 or 0.0.
    score += 5.0 * has_lower
    score += 5.0 * has_upper
    score += 5.0 * has_digit
    score += 5.0 * has_special

    if " " in s:
        score -= 2.0
    if s.islower() or s.isupper():
        score -= 1.0
    if all(c in "0123456789abcdefABCDEF" for c in s):
        score -= 3.0
    return score


def extract_hits(data: bytes, *, min_len: int, max_len: int) -> Iterator[Tuple[str, int, str]]:
    """Yield ``(string, byte_offset, kind)`` for each printable run in *data*.

    All ASCII matches are yielded first (kind ``"ascii"``), followed by all
    UTF-16LE matches (kind ``"utf16le"``).  Runs longer than *max_len*
    characters are reported as consecutive chunks because the patterns cap
    the repetition count.
    """
    ascii_re = re.compile(ASCII_RE_TEMPLATE % (min_len, max_len))
    utf16_re = re.compile(UTF16LE_ASCII_RE_TEMPLATE % (min_len, max_len))

    # The patterns only match bytes in the printable ASCII range, so a strict
    # decode cannot fail.
    for m in ascii_re.finditer(data):
        yield m.group(0).decode("ascii"), m.start(), "ascii"
    for m in utf16_re.finditer(data):
        # Every second byte is the NUL high byte; drop those before decoding.
        yield m.group(0)[::2].decode("ascii"), m.start(), "utf16le"


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the command-line scan and print the top-ranked candidates.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        0 when at least one candidate was found, 2 when none were.

    Raises:
        SystemExit: raised by argparse for invalid arguments.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("paths", nargs="+", help="Files/dirs to scan")
    ap.add_argument("--min-len", type=int, default=8)
    ap.add_argument("--max-len", type=int, default=64)
    ap.add_argument("--top", type=int, default=80)
    ap.add_argument("--grep", type=str, default="", help="Only show hits containing this substring")
    args = ap.parse_args(argv)

    # Reject values that would otherwise surface as a regex compile error,
    # as empty matches, or as a silently truncated result list.
    if args.min_len < 1:
        ap.error("--min-len must be at least 1")
    if args.max_len < args.min_len:
        ap.error("--max-len must not be smaller than --min-len")
    if args.top < 0:
        ap.error("--top must not be negative")

    best: Dict[str, Hit] = {}
    grep = args.grep

    for fp in iter_files(args.paths):
        if fp.suffix.lower() not in SCAN_SUFFIXES:
            continue
        try:
            data = fp.read_bytes()
        except (OSError, MemoryError) as exc:
            # Best effort: report the unreadable file and carry on.
            print(f"[!] Skipping {fp}: {exc}", file=sys.stderr)
            continue

        for s, off, kind in extract_hits(data, min_len=args.min_len, max_len=args.max_len):
            if grep and grep not in s:
                continue
            existing = best.get(s)
            if existing is not None:
                # A recorded string already passed the filters, and its score
                # depends only on the string, so only the counter changes.
                existing.count += 1
                continue
            if not looks_passwordish(s):
                continue
            best[s] = Hit(s=s, score=score_string(s), file=fp, offset=off, kind=kind)

    hits: List[Hit] = sorted(best.values(), key=lambda h: h.score, reverse=True)
    if not hits:
        print("[!] No candidates found")
        return 2

    for h in hits[: args.top]:
        print(f"{h.score:8.2f}\t{h.count:4d}\t{h.kind}\t{h.file}\t0x{h.offset:X}\t{h.s}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())