Init. commit

2026-03-02 21:44:22 +03:00
commit 9511b38280
38 changed files with 4397 additions and 0 deletions
--- a/HumanAI-Forensic-Hard/scripts/extract_password_candidates.py
+++ b/HumanAI-Forensic-Hard/scripts/extract_password_candidates.py
@@ -0,0 +1,188 @@
+import argparse
+import math
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, Iterator, List, Tuple
+
+
+# Byte-regex templates for runs of printable ASCII (0x20 space through 0x7E
+# tilde).  The two %d slots are the minimum and maximum run length and are
+# filled in by extract_hits().
+ASCII_RE_TEMPLATE = rb"[ -~]{%d,%d}"
+# Same character class, but every character is followed by a NUL byte, i.e.
+# ASCII text stored as UTF-16LE; the bounds count characters, not bytes.
+UTF16LE_ASCII_RE_TEMPLATE = rb"(?:[ -~]\x00){%d,%d}"
+
+
+@dataclass
+class Hit:
+    """One candidate string together with where it was found.
+
+    main() keeps a single Hit per distinct string and increments ``count``
+    each time the same string is extracted again.
+    """
+
+    s: str  # the extracted candidate text
+    score: float  # value returned by score_string(s); higher sorts first
+    file: Path  # file the recorded occurrence was read from
+    offset: int  # byte offset of the regex match within that file
+    kind: str  # "ascii" or "utf16le", as tagged by extract_hits()
+    count: int = 1  # how many times this exact string has been seen
+
+
+def iter_files(paths: Iterable[str]) -> Iterator[Path]:
+    """Yield every regular file named by, or found beneath, *paths*.
+
+    A directory argument is walked recursively and its files are yielded in
+    sorted path order.  An argument that is neither an existing file nor a
+    directory is skipped without any message.
+    """
+    for raw in paths:
+        root = Path(raw)
+        if root.is_file():
+            yield root
+            continue
+        if not root.is_dir():
+            continue
+        yield from (entry for entry in sorted(root.rglob("*")) if entry.is_file())
+
+
+def shannon_entropy(s: str) -> float:
+    """Return the Shannon entropy of *s* in bits per character.
+
+    An empty string has entropy 0.0.
+    """
+    total = len(s)
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    # dict.fromkeys() de-duplicates while keeping first-occurrence order, so
+    # the terms are accumulated in a fixed, input-determined order.
+    for symbol in dict.fromkeys(s):
+        prob = s.count(symbol) / total
+        entropy -= prob * math.log2(prob)
+    return entropy
+
+
+# Substrings that mark a string as operating-system or application noise
+# rather than a password candidate: registry / BaseNamedObjects paths, drive
+# and path separators, Windows-related names, and common file extensions.
+# looks_passwordish() rejects any string containing one of these; the match is
+# a plain, case-sensitive ``in`` test.
+BAD_SUBSTRINGS = (
+    "\\\\",
+    "\\Registry\\",
+    "\\Registry",
+    "\\BaseNamedObjects\\",
+    "\\BaseNamedObjects",
+    ":\\",
+    "/",
+    "System32",
+    "Windows",
+    "Microsoft",
+    "CLSID",
+    "AppX",
+    "shell:::",
+    "atom(",
+    ".dll",
+    ".exe",
+    ".sys",
+    ".ini",
+    ".mui",
+    ".nls",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".ttf",
+    ".otf",
+    ".wav",
+    ".mp3",
+    ".mp4",
+    ".sqlite",
+)
+
+
+# Characters that disqualify a candidate outright (path/URL separators,
+# markup and quoting characters, '=', and tab/CR/LF).
+_REJECTED_CHARS = frozenset(('\\', '/', ':', '<', '>', '"', "'", '=', '\t', '\r', '\n'))
+
+# 8-4-4-4-12 hexadecimal GUID/UUID layout; compiled once instead of per call.
+_GUID_RE = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
+
+
+def looks_passwordish(s: str) -> bool:
+    """Return True when *s* survives the noise filters for password candidates.
+
+    A string is rejected when any of the following holds:
+
+    * it contains one of the characters in ``_REJECTED_CHARS``;
+    * it contains any entry of ``BAD_SUBSTRINGS`` (case-sensitive);
+    * it has four or more spaces;
+    * it uses three or fewer distinct characters;
+    * it is, in its entirety, a GUID in 8-4-4-4-12 hex form.
+
+    Args:
+        s: Candidate text extracted from a dump.
+
+    Returns:
+        True if none of the rejection rules matched, otherwise False.
+    """
+    # URLs need no dedicated check: "http://" and "https://" contain ':' and
+    # '/', both of which are rejected here, so an explicit prefix test after
+    # this point could never fire.
+    if not _REJECTED_CHARS.isdisjoint(s):
+        return False
+    if any(bad in s for bad in BAD_SUBSTRINGS):
+        return False
+
+    # Many spaces suggests a sentence or message rather than a secret.
+    if s.count(" ") >= 4:
+        return False
+
+    # Very low character variety (e.g. "aaaaaaaa", "abababab") is padding/noise.
+    if len(set(s)) <= 3:
+        return False
+
+    return _GUID_RE.fullmatch(s) is None
+
+
+def score_string(s: str) -> float:
+    """Return a heuristic score for *s*; higher means more password-like.
+
+    The base value is the Shannon entropy of *s* multiplied by its length.
+    Each character class that occurs at least once (ASCII lowercase, ASCII
+    uppercase, ASCII digit, non-alphanumeric) adds 5 points.  A space costs
+    2 points, single-case text costs 1, and a string made only of hexadecimal
+    digits costs 3.
+    """
+    score = shannon_entropy(s) * len(s)
+
+    # One flag per character class, in the order the bonuses are applied.
+    class_present = (
+        re.search(r"[a-z]", s) is not None,
+        re.search(r"[A-Z]", s) is not None,
+        re.search(r"[0-9]", s) is not None,
+        not all(ch.isalnum() for ch in s),
+    )
+    for present in class_present:
+        if present:
+            score += 5.0
+
+    # Penalties for shapes that are less likely to be real passwords.
+    if " " in s:
+        score -= 2.0
+    if s.islower() or s.isupper():
+        score -= 1.0
+    if set(s).issubset("0123456789abcdefABCDEF"):
+        score -= 3.0
+    return score
+
+
+def extract_hits(data: bytes, *, min_len: int, max_len: int) -> Iterator[Tuple[str, int, str]]:
+    """Yield ``(text, offset, kind)`` for printable-ASCII runs found in *data*.
+
+    All plain-ASCII matches are yielded first (kind ``"ascii"``), followed by
+    all UTF-16LE matches (kind ``"utf16le"``).  ``offset`` is the byte position
+    where the match starts.  Matches are non-overlapping and contain between
+    *min_len* and *max_len* characters, so a longer run is reported in pieces.
+    """
+    bounds = (min_len, max_len)
+    # (kind tag, byte stride that keeps only the character bytes, pattern)
+    scanners = (
+        ("ascii", 1, re.compile(ASCII_RE_TEMPLATE % bounds)),
+        ("utf16le", 2, re.compile(UTF16LE_ASCII_RE_TEMPLATE % bounds)),
+    )
+
+    for kind, stride, pattern in scanners:
+        for match in pattern.finditer(data):
+            # For UTF-16LE the stride of 2 drops the interleaved NUL bytes.
+            text = match.group(0)[::stride].decode("ascii", errors="ignore")
+            yield text, match.start(), kind
+
+
+def main() -> int:
+    """Scan files for password-like strings and print the highest-scoring ones.
+
+    Command-line arguments are read by argparse.  Only files whose suffix is
+    ``.dmp``, ``.mem``, ``.raw``, ``.bin`` or empty are scanned.  Each distinct
+    candidate is reported once, with the number of times it was seen and the
+    location of its first occurrence, as a tab-separated line on stdout.
+    Files that cannot be read are reported on stderr and skipped.
+
+    Returns:
+        0 if at least one candidate was printed, 2 if none were found.
+        Invalid length bounds end the program through ``ArgumentParser.error``.
+    """
+    import sys  # local import: only this entry point writes to stderr
+
+    ap = argparse.ArgumentParser()
+    ap.add_argument("paths", nargs="+", help="Files/dirs to scan")
+    ap.add_argument("--min-len", type=int, default=8)
+    ap.add_argument("--max-len", type=int, default=64)
+    ap.add_argument("--top", type=int, default=80)
+    ap.add_argument("--grep", type=str, default="", help="Only show hits containing this substring")
+    args = ap.parse_args()
+
+    # The bounds are pasted into a regex quantifier; a negative value or
+    # min > max would otherwise yield a nonsense pattern or a raw re.error.
+    if args.min_len < 0 or args.max_len < args.min_len:
+        ap.error("--min-len must be >= 0 and --max-len must be >= --min-len")
+
+    best: Dict[str, Hit] = {}
+    grep = args.grep
+
+    for fp in iter_files(args.paths):
+        if fp.suffix.lower() not in (".dmp", ".mem", ".raw", ".bin", ""):
+            continue
+
+        try:
+            data = fp.read_bytes()
+        except (OSError, MemoryError, OverflowError) as exc:
+            # Best effort: one unreadable or oversized dump must not abort the
+            # scan, but the user needs to know it was not examined.
+            print(f"[!] Skipping {fp}: {exc!r}", file=sys.stderr)
+            continue
+
+        for s, off, kind in extract_hits(data, min_len=args.min_len, max_len=args.max_len):
+            if grep and grep not in s:
+                continue
+
+            existing = best.get(s)
+            if existing is not None:
+                # The score depends only on the text, so a repeat can never
+                # out-score the recorded occurrence; just count it.
+                existing.count += 1
+                continue
+
+            if not looks_passwordish(s):
+                continue
+            best[s] = Hit(s=s, score=score_string(s), file=fp, offset=off, kind=kind)
+
+    if not best:
+        print("[!] No candidates found")
+        return 2
+
+    hits: List[Hit] = sorted(best.values(), key=lambda h: h.score, reverse=True)
+    for h in hits[: args.top]:
+        print(f"{h.score:8.2f}\t{h.count:4d}\t{h.kind}\t{h.file}\t0x{h.offset:X}\t{h.s}")
+
+    return 0
+
+
+# Run the CLI when executed as a script; main()'s return value becomes the
+# process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())