#!/usr/bin/env python3
"""
extrair_strings.py
Extrai frases em Português visíveis do arquivo PHP/HTML indicado em SOURCE_FILE
e gera:
  • checkin-bhb.strings_pt.txt   – lista das frases (uma por linha)
  • checkin-bhb.dict_stub.php    – stub para colar no $dict['pt']
Dependências:
  pip install beautifulsoup4 langdetect   # langdetect opcional; defina USE_LANGDETECT=False p/ dispensar
"""

import re, unicodedata, pathlib
from bs4 import BeautifulSoup            # pip install beautifulsoup4
USE_LANGDETECT = True                    # set to False if you do not want to install langdetect
if USE_LANGDETECT:
    try:
        from langdetect import DetectorFactory, detect
    except ImportError:
        # Optional dependency: fall back to accepting every phrase.
        print("langdetect ausente – continuarei sem filtrar idioma")
        USE_LANGDETECT = False
    else:
        # langdetect is randomized by default; a fixed seed makes repeated
        # runs classify the same phrases the same way.
        DetectorFactory.seed = 0

# —————————————————————————————————————————————————————————
# PHP/HTML file to scan -- edit this path to point at your own file.
SOURCE_FILE = pathlib.Path(r"C:\xampp\htdocs\BI-HITS\dados\checkin-bhb.php")
# —————————————————————————————————————————————————————————

def slugify(txt: str, maxlen: int = 40) -> str:
    """Turn *txt* into a lowercase ASCII identifier.

    Accented characters are decomposed (NFKD) and their non-ASCII parts
    dropped, every run of characters outside ``[a-z0-9]`` becomes a single
    underscore, and the result is cut to at most *maxlen* characters.

    Args:
        txt: Text to convert.
        maxlen: Maximum length of the returned slug.

    Returns:
        The slug, or ``"texto"`` when nothing usable is left.
    """
    ascii_txt = unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode()
    slug = re.sub(r"[^a-z0-9]+", "_", ascii_txt.lower()).strip("_")
    # Truncation can cut right after a separator; strip again so the slug
    # never ends with a dangling underscore.
    return slug[:maxlen].rstrip("_") or "texto"

def is_pt(txt):
    """Tell whether *txt* should be kept as a Portuguese phrase.

    With language filtering switched off (``USE_LANGDETECT`` is false) every
    text is accepted. Otherwise the text is accepted only when ``detect``
    reports ``"pt"``; any error raised by the detector counts as "not
    Portuguese".
    """
    if USE_LANGDETECT:
        try:
            lang = detect(txt)
        except Exception:
            # Best effort: a text the detector cannot handle is discarded.
            return False
        return lang == "pt"
    return True

def _php_single_quoted(text):
    """Return *text* as a PHP single-quoted string literal."""
    # Inside PHP single quotes only the backslash and the quote are special.
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


# Read leniently: bytes that are not valid UTF-8 are dropped, not fatal.
html_raw = SOURCE_FILE.read_text(encoding="utf-8", errors="ignore")
# Remove <script>/<style> elements together with their contents.
html_clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", " ", html_raw, flags=re.S|re.I)
# Remove PHP code: "<?php ... ?>" and "<?= ... ?>" blocks, including a final
# block that runs to end of file because its closing "?>" was omitted.
html_clean = re.sub(r"<\?(?:php\b|=)[\s\S]*?(?:\?>|\Z)", " ", html_clean, flags=re.I)

# Keep each distinct text node that is longer than two characters, contains
# no template/format placeholder ("{{", "}}", "%x") and passes is_pt().
# NOTE: only leading/trailing whitespace is stripped, so a text node with
# internal line breaks will span several lines in the .txt output.
texts = BeautifulSoup(html_clean, "html.parser").stripped_strings
frases = {
    t for t in texts
    if len(t) > 2 and not re.search(r"{{|}}|%\w", t) and is_pt(t)
}
# Sort once; the original string breaks case-insensitive ties so the output
# order does not depend on set iteration order.
frases_ordenadas = sorted(frases, key=lambda s: (s.lower(), s))

# Output files are written next to the PHP source.
txt_out  = SOURCE_FILE.with_suffix(".strings_pt.txt")
php_stub = SOURCE_FILE.with_suffix(".dict_stub.php")

txt_out.write_text("\n".join(frases_ordenadas), encoding="utf-8")
# The phrase is escaped so quotes/backslashes cannot break the PHP literal;
# the slug only contains [a-z0-9_] and needs no escaping.
stub_lines = [
    f"    {_php_single_quoted(pt)} => '{slugify(pt)}',"
    for pt in frases_ordenadas
]
php_stub.write_text("        // ——— Auto-extraído ———\n" + "\n".join(stub_lines), encoding="utf-8")

print(f"✔️  {len(frases)} frases salvas em {txt_out.name}")
print(f"✔️  Stub PHP criado em {php_stub.name}")