import argparse
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Tuple


# Bytes-regex templates.  The two %d slots are filled with the minimum and
# maximum run length (see extract_hits).  [ -~] is the printable ASCII range
# 0x20-0x7E.
ASCII_RE_TEMPLATE = rb"[ -~]{%d,%d}"
# Same character class, but every character must be followed by a NUL byte,
# i.e. ASCII-range text laid out as UTF-16LE code units.
UTF16LE_ASCII_RE_TEMPLATE = rb"(?:[ -~]\x00){%d,%d}"


@dataclass
class Hit:
    """One distinct candidate string together with where it was found."""
    s: str  # the decoded candidate string (also the key in main()'s dict)
    score: float  # ranking value from score_string(); higher sorts first
    file: Path  # file the recorded occurrence was read from
    offset: int  # byte offset of the regex match within that file's data
    kind: str  # encoding tag yielded by extract_hits: "ascii" or "utf16le"
    count: int = 1  # how many times this exact string has been seen so far


def iter_files(paths: Iterable[str]) -> Iterator[Path]:
    """Yield every regular file named by, or nested beneath, *paths*.

    Directories are walked recursively and their entries are produced in
    sorted order; a path that is itself a regular file is yielded as-is.
    Anything else (for example a path that does not exist) is skipped
    without error.
    """
    for raw in paths:
        root = Path(raw)
        if root.is_dir():
            # rglob also returns sub-directories; keep regular files only.
            yield from (
                entry for entry in sorted(root.rglob("*")) if entry.is_file()
            )
            continue
        if root.is_file():
            yield root


def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    An empty string has entropy 0.0.  Character frequencies are taken over
    the string itself, so the result ranges from 0.0 (one repeated
    character) up to log2(number of distinct characters).
    """
    if not s:
        return 0.0

    # Tally occurrences; dict insertion order fixes the summation order.
    occurrences: Dict[str, int] = {}
    for symbol in s:
        if symbol in occurrences:
            occurrences[symbol] += 1
        else:
            occurrences[symbol] = 1

    total = len(s)
    entropy = 0.0
    for tally in occurrences.values():
        probability = tally / total
        entropy -= probability * math.log2(probability)
    return entropy


# Substrings that mark a string as a path, registry key, file name or other
# OS artefact rather than a credential.  looks_passwordish() matches these
# case-insensitively.  Entries containing a backslash, "/" or ":" can never
# be the deciding factor because those characters are rejected outright
# first; they are kept so the constant stays unchanged for any importer.
BAD_SUBSTRINGS = (
    "\\\\",
    "\\Registry\\",
    "\\Registry",
    "\\BaseNamedObjects\\",
    "\\BaseNamedObjects",
    ":\\",
    "/",
    "System32",
    "Windows",
    "Microsoft",
    "CLSID",
    "AppX",
    "shell:::",
    "atom(",
    ".dll",
    ".exe",
    ".sys",
    ".ini",
    ".mui",
    ".nls",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".ttf",
    ".otf",
    ".wav",
    ".mp3",
    ".mp4",
    ".sqlite",
)

# Lower-cased once at import so each check is a plain substring scan.
_BAD_SUBSTRINGS_LOWER = tuple(bad.lower() for bad in BAD_SUBSTRINGS)

# Any of these characters disqualifies a candidate immediately.
_FORBIDDEN_CHARS = frozenset(
    ("\\", "/", ":", "<", ">", '"', "'", "=", "\t", "\r", "\n")
)

# Canonical 8-4-4-4-12 hex GUID/UUID, compiled once instead of per call.
_GUID_RE = re.compile(
    r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
)


def looks_passwordish(s: str) -> bool:
    """Return True when *s* survives the "obviously not a password" filters.

    A string is rejected when it:

    * contains a path/markup/assignment character (backslash, ``/``, ``:``,
      ``<``, ``>``, either quote, ``=``) or a tab, CR or LF;
    * contains any entry of ``BAD_SUBSTRINGS``, compared case-insensitively
      so that e.g. ``KERNEL32.DLL`` is filtered just like ``kernel32.dll``;
    * contains four or more spaces;
    * uses three or fewer distinct characters;
    * is exactly a hyphenated GUID.

    No explicit URL test is needed: anything starting with ``http://`` or
    ``https://`` contains ``:`` and ``/`` and is rejected by the first rule.
    """
    if not _FORBIDDEN_CHARS.isdisjoint(s):
        return False

    lowered = s.lower()
    if any(bad in lowered for bad in _BAD_SUBSTRINGS_LOWER):
        return False

    if s.count(" ") >= 4:
        return False

    if len(set(s)) <= 3:
        return False

    return _GUID_RE.fullmatch(s) is None


def score_string(s: str) -> float:
    """Rank a candidate string; a higher value means more password-like.

    The base value is the per-character Shannon entropy multiplied by the
    string length.  Five points are added for each character class present
    (ASCII lowercase, ASCII uppercase, ASCII digit, non-alphanumeric).
    Small penalties follow: 2 for containing a space, 1 when every cased
    character shares one case, and 3 when the string consists solely of
    hexadecimal digits.
    """
    score = shannon_entropy(s) * len(s)

    class_present = (
        any("a" <= ch <= "z" for ch in s),
        any("A" <= ch <= "Z" for ch in s),
        any("0" <= ch <= "9" for ch in s),
        any(not ch.isalnum() for ch in s),
    )
    # Add the bonuses one at a time so float rounding matches a plain
    # sequence of "+=" statements.
    for present in class_present:
        score += 5.0 * present

    if " " in s:
        score -= 2.0
    if s.islower() or s.isupper():
        score -= 1.0
    if set(s) <= set("0123456789abcdefABCDEF"):
        score -= 3.0
    return score


def extract_hits(data: bytes, *, min_len: int, max_len: int) -> Iterator[Tuple[str, int, str]]:
    """Yield ``(text, offset, kind)`` for each printable run found in *data*.

    All single-byte ASCII runs are produced first (``kind == "ascii"``),
    then all runs of ASCII characters interleaved with NUL bytes
    (``kind == "utf16le"``).  ``offset`` is the byte position where the
    match starts.  Runs must be at least ``min_len`` characters long.  A
    run longer than ``max_len`` is not dropped: it comes out as consecutive
    pieces of at most ``max_len`` characters, and a trailing piece shorter
    than ``min_len`` is omitted.
    """
    bounds = (min_len, max_len)
    narrow_pattern = re.compile(ASCII_RE_TEMPLATE % bounds)
    wide_pattern = re.compile(UTF16LE_ASCII_RE_TEMPLATE % bounds)

    yield from (
        (match.group(0).decode("ascii", errors="ignore"), match.start(), "ascii")
        for match in narrow_pattern.finditer(data)
    )
    # Every second byte of a wide match is the NUL half of the code unit;
    # slicing with a step of two leaves just the ASCII bytes.
    yield from (
        (match.group(0)[::2].decode("ascii", errors="ignore"), match.start(), "utf16le")
        for match in wide_pattern.finditer(data)
    )


def main() -> int:
    """Scan files for password-like strings and print the top candidates.

    Arguments are parsed from the command line: one or more files or
    directories, ``--min-len`` / ``--max-len`` bounds on string length,
    ``--top`` for how many results to print, and ``--grep`` to keep only
    strings containing a given substring.  Only files whose extension is
    ``.dmp``, ``.mem``, ``.raw``, ``.bin`` or empty are read.

    Each distinct string is reported once, with the location of its first
    occurrence and the number of times it was seen, ordered by descending
    score.  Output columns are tab-separated: score, count, kind, file,
    hex offset, string.

    Returns:
        0 if at least one candidate was printed, 2 if none were found.

    Raises:
        SystemExit: via argparse when the arguments are invalid, including
            ``--min-len`` below 1 or ``--max-len`` below ``--min-len``.
    """
    import sys  # local import: only needed for the stderr warning below

    ap = argparse.ArgumentParser()
    ap.add_argument("paths", nargs="+", help="Files/dirs to scan")
    ap.add_argument("--min-len", type=int, default=8)
    ap.add_argument("--max-len", type=int, default=64)
    ap.add_argument("--top", type=int, default=80)
    ap.add_argument("--grep", type=str, default="", help="Only show hits containing this substring")
    args = ap.parse_args()

    # Inverted or non-positive bounds would otherwise surface as a regex
    # compilation traceback (or a nonsensical pattern) deep inside the scan.
    if args.min_len < 1 or args.max_len < args.min_len:
        ap.error("--min-len must be >= 1 and --max-len must be >= --min-len")

    best: Dict[str, Hit] = {}
    grep = args.grep

    for fp in iter_files(args.paths):

        if fp.suffix.lower() not in (".dmp", ".mem", ".raw", ".bin", ""):
            continue

        try:
            data = fp.read_bytes()
        except (OSError, MemoryError) as exc:
            # Best effort: keep scanning the other files, but say why this
            # one was skipped instead of dropping it silently.
            print(f"[!] Skipping {fp}: {exc}", file=sys.stderr)
            continue

        for s, off, kind in extract_hits(data, min_len=args.min_len, max_len=args.max_len):
            existing = best.get(s)
            if existing is not None:
                # The filters and the score depend only on the string, so a
                # repeat can never score higher than the stored hit; just
                # count it and skip the redundant filtering and scoring.
                existing.count += 1
                continue
            if grep and grep not in s:
                continue
            if not looks_passwordish(s):
                continue
            best[s] = Hit(s=s, score=score_string(s), file=fp, offset=off, kind=kind)

    hits: List[Hit] = sorted(best.values(), key=lambda h: h.score, reverse=True)
    if not hits:
        print("[!] No candidates found")
        return 2

    for h in hits[: args.top]:
        print(f"{h.score:8.2f}\t{h.count:4d}\t{h.kind}\t{h.file}\t0x{h.offset:X}\t{h.s}")

    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())