#!/usr/bin/env python3
"""
Qoo10 Japan Review Crawler — Google Colab 버전 v3.0
(병렬 배치 크롤링 + deep-translator 고속 번역)

사용법: Google Colab에서 셀 4개를 순서대로 실행
  셀 1: 설치 (playwright + deep-translator)
  셀 2: 크롤링 (병렬 배치)
  셀 3: 번역 (deep-translator 병렬 → translations.json)
  셀 4: zip 다운로드
"""

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Cell 1: installation
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Notebook source for the first cell. It pip-installs playwright, tqdm and
# deep-translator, then installs the Chromium browser and its system
# dependencies through the playwright CLI. The lines starting with "!" are
# notebook shell escapes, so this text is meant to be pasted into a Colab
# cell rather than executed as plain Python.
CELL_1 = """
!pip install playwright tqdm deep-translator -q
!playwright install chromium
!playwright install-deps chromium
print("✅ 설치 완료! (Playwright + deep-translator)")
"""

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Cell 2: crawler (parallel batch version)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Notebook source for the crawl cell. The embedded code:
#   * derives GOODS_CODE from PRODUCT_URL (a product URL or a bare goods code),
#   * opens the product page in headless Chromium and reads the review count
#     and page size from the #total_count / #page_size elements,
#   * fetches review pages in batches of BATCH_SIZE via in-page fetch() calls,
#     retrying each failed page up to three times,
#   * writes every page to output/<GOODS_CODE>/p<N>.html and a summary to
#     output/<GOODS_CODE>/meta.json.
# The cell ends with a top-level ``await``, so it only runs in a notebook
# (or another environment that allows top-level await).
CELL_2 = """
# ★ 여기만 바꾸세요 ★
PRODUCT_URL = "https://www.qoo10.jp/g/1102765780"
BATCH_SIZE  = 10   # 동시 요청 수 (막히면 5로 줄이고, 여유있으면 20까지)

import re, asyncio, json, time, random, logging
from pathlib import Path
from tqdm.notebook import tqdm
from playwright.async_api import async_playwright

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

def extract_code(url):
    # Accepts a product URL whose 8-12 digit goods code is followed by
    # optional slashes and then the end of the string, a query string or a
    # fragment; also accepts a bare numeric goods code.
    url = url.strip()
    m = re.search(r'/(\\d{8,12})/*(?:[?#]|$)', url)
    if m: return m.group(1)
    if re.match(r'^\\d+$', url): return url
    return None

GOODS_CODE = extract_code(PRODUCT_URL)
if not GOODS_CODE:
    raise ValueError(f"상품코드 추출 실패: {PRODUCT_URL}")
# extract_code() also accepts a bare goods code, but page.goto() needs an
# absolute URL, so rebuild one when no http(s) scheme is present.
PRODUCT_URL = PRODUCT_URL.strip()
if not re.match(r'^https?://', PRODUCT_URL, re.I):
    PRODUCT_URL = f"https://www.qoo10.jp/g/{GOODS_CODE}"
print(f"🔍 상품코드: {GOODS_CODE}")
print(f"🔗 URL: {PRODUCT_URL}")

OUTPUT_DIR = Path(f"output/{GOODS_CODE}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

REVIEW_AJAX = "https://www.qoo10.jp/gmkt.inc/Goods/GoodsReviewAjaxAppend.aspx"
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

async def setup_session(page):
    # Returns (product_name, total_count, page_size, total_pages); on any
    # failure returns the placeholder name with 0 reviews / 0 pages.
    product_name = f"상품_{GOODS_CODE}"
    try:
        await page.goto(PRODUCT_URL, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(3000)
        try:
            product_name = (
                await page.locator("h1, .goods_name").first.inner_text()
            ).strip()[:80]
        except Exception:
            pass  # the name is only used for logging/meta; keep the placeholder
        await page.locator("li:has-text('レビュー')").first.click()
        await page.wait_for_timeout(2000)
        total_count = int(await page.eval_on_selector("#total_count", "el => el.value"))
        page_size   = int(await page.eval_on_selector("#page_size",   "el => el.value"))
        total_pages = (total_count + page_size - 1) // page_size
        logger.info(f"상품명: {product_name} | 총 리뷰: {total_count:,}개 | {total_pages:,}페이지")
        return product_name, total_count, page_size, total_pages
    except Exception as e:
        logger.error(f"세션 초기화 실패: {e}")
        return product_name, 0, 10, 0

def build_url(pn, page_size=10):
    # page_size must be the same value total_pages was computed from,
    # otherwise the requested page range would not cover every review.
    return (
        f"{REVIEW_AJAX}?gd_no={GOODS_CODE}&group_code=2&page_no={pn}"
        f"&page_size={page_size}&sort_type=P&contents_cnt=0"
        f"&___cache_expire___={int(time.time()*1000)+pn}"
    )

async def fetch_batch(page, page_numbers, page_size=10):
    # Fetches all pages of one batch concurrently inside the browser page;
    # a failed request yields '' at its position.
    urls = [build_url(pn, page_size) for pn in page_numbers]
    try:
        return await page.evaluate(
            \"\"\"async (urls) => Promise.all(urls.map(async url => {
                try {
                    const r = await fetch(url, {credentials: 'include'});
                    return await r.text();
                } catch { return ''; }
            }))\"\"\",
            urls,
        )
    except Exception as e:
        logger.warning(f"배치 실패: {e}")
        return [""] * len(page_numbers)

async def fetch_single(page, pn, page_size=10):
    # Fetches one page; returns '' unless the response contains an <li.
    try:
        html = await page.evaluate(
            \"\"\"async url => {
                const r = await fetch(url, {credentials: 'include'});
                return await r.text();
            }\"\"\",
            build_url(pn, page_size),
        )
        return html if html and "<li" in html else ""
    except Exception as e:
        logger.warning(f"페이지 {pn} 실패: {e}")
        return ""

async def run_crawler():
    t0 = time.time()
    name, total_count, page_size, total_pages = f"상품_{GOODS_CODE}", 0, 10, 0
    browser = None
    async with async_playwright() as p:
        try:
            browser = await p.chromium.launch(headless=True)
            ctx = await browser.new_context(user_agent=UA, locale="ja-JP")
            page = await ctx.new_page()
            name, total_count, page_size, total_pages = await setup_session(page)
            if total_pages == 0:
                logger.error("❌ 리뷰를 찾을 수 없습니다."); return
            ok, fail = 0, 0
            batches = [
                list(range(i, min(i + BATCH_SIZE, total_pages + 1)))
                for i in range(1, total_pages + 1, BATCH_SIZE)
            ]
            pbar = tqdm(batches, desc="📥 크롤링", unit="batch")
            for batch in pbar:
                results = await fetch_batch(page, batch, page_size)
                retry_pages = []
                for pn, html in zip(batch, results):
                    if html and "<li" in html:
                        (OUTPUT_DIR / f"p{pn}.html").write_text(html, encoding="utf-8")
                        ok += 1
                    else:
                        retry_pages.append(pn)
                for pn in retry_pages:
                    saved = False
                    for attempt in range(3):
                        await asyncio.sleep((attempt + 1) * 1.5)
                        html = await fetch_single(page, pn, page_size)
                        if html:
                            (OUTPUT_DIR / f"p{pn}.html").write_text(html, encoding="utf-8")
                            ok += 1; saved = True; break
                    if not saved:
                        fail += 1; logger.error(f"페이지 {pn} 최종 실패")
                await asyncio.sleep(random.uniform(0.2, 0.5))
                pbar.set_postfix({"성공": ok, "실패": fail})
                if ok % 500 < BATCH_SIZE:
                    # Session refresh is best-effort: a navigation timeout
                    # must not abort the crawl before meta.json is written.
                    try:
                        await page.goto(PRODUCT_URL, wait_until="domcontentloaded", timeout=30000)
                        await page.wait_for_timeout(1500)
                    except Exception as e:
                        logger.warning(f"세션 갱신 실패(계속 진행): {e}")
        finally:
            if browser: await browser.close()
    meta = {
        "goods_code": GOODS_CODE, "product_name": name, "product_url": PRODUCT_URL,
        "total_reviews": total_count, "total_pages": total_pages,
        "page_size": page_size, "crawled_pages": ok, "failed_pages": fail,
        "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    (OUTPUT_DIR / "meta.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    elapsed = time.time() - t0
    print(f"\\n✅ 완료! {ok:,}/{total_pages:,}페이지 | {elapsed/60:.1f}분 소요")

await run_crawler()
"""

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Cell 3: translate review text to Korean (fast v3)
# deep-translator + parallel processing
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Notebook source for the translation cell. The embedded code:
#   * reads output/<GOODS_CODE>/p*.html (GOODS_CODE must already exist in the
#     notebook namespace; the crawl cell defines it),
#   * collects the text of elements whose class contains "review_txt" and
#     de-duplicates it while keeping first-seen order,
#   * joins texts into batches of at most BATCH_CHARS characters separated by
#     SEP and translates them ja -> ko with GoogleTranslator on WORKERS threads,
#   * writes {original text: translation} to translations.json.
# A batch result is only accepted when splitting it yields exactly one part
# per input text; otherwise the batch is re-translated one text at a time so
# that translations can never be attached to the wrong review.
CELL_3 = """
import subprocess, sys, json, time, re
from pathlib import Path
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

# ── 패키지 자동 설치 ──
for pkg in ["deep-translator"]:
    try: __import__(pkg.replace("-","_"))
    except ImportError: subprocess.check_call([sys.executable,"-m","pip","install",pkg,"-q"])
from deep_translator import GoogleTranslator

OUTPUT_DIR = Path(f"output/{GOODS_CODE}")

# ── 1) 크롤링 HTML에서 리뷰 텍스트 추출 ──
class ReviewExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self._in_txt = False
        self.texts = []
        self._buf = ""
    def handle_starttag(self, tag, attrs):
        # HTMLParser reports a valueless attribute (<div class>) as None.
        cls = dict(attrs).get("class") or ""
        if "review_txt" in cls:
            self._in_txt = True; self._buf = ""
    def handle_data(self, data):
        if self._in_txt: self._buf += data
    def handle_endtag(self, tag):
        if self._in_txt and tag in ("p", "div", "span"):
            t = self._buf.strip()
            if t: self.texts.append(t)
            self._in_txt = False

all_texts = []
html_files = sorted(OUTPUT_DIR.glob("p*.html"),
                     key=lambda f: int(re.sub(r'\\D','',f.stem) or 0))
print(f"📄 {len(html_files)}개 HTML 파일에서 리뷰 추출 중...")
for f in html_files:
    ext = ReviewExtractor()
    ext.feed(f.read_text(encoding="utf-8", errors="ignore"))
    all_texts.extend(ext.texts)

seen = set()
unique_texts = []
for t in all_texts:
    if t not in seen:
        seen.add(t); unique_texts.append(t)
print(f"✅ 고유 리뷰 {len(unique_texts):,}건 추출 완료")

# ── 2) 고속 배치 번역 ──
SEP = "\\n〰〰〰\\n"            # 번역 시 유지되는 안전한 구분자
BATCH_CHARS = 4800            # Google 한도 ~5000자
WORKERS = 4                   # 동시 번역 스레드 수

# 배치 구성 (글자수 기준)
batches = []
buf_texts = []
buf_len = 0
for t in unique_texts:
    add_len = len(t) + len(SEP)
    if buf_len + add_len > BATCH_CHARS and buf_texts:
        batches.append(buf_texts[:])
        buf_texts = []; buf_len = 0
    buf_texts.append(t)
    buf_len += add_len
if buf_texts:
    batches.append(buf_texts[:])

avg_per_batch = len(unique_texts) // max(len(batches), 1)
print(f"🚀 {len(unique_texts):,}건 → {len(batches):,}배치 "
      f"(~{avg_per_batch}건/배치, {WORKERS}스레드 병렬)")

translations = {}
errors = 0

def split_translated(result, expected):
    # Tries each separator form the translator may have produced and returns
    # the stripped parts only when exactly `expected` pieces come out;
    # returns None when no separator gives a one-to-one split.
    for marker in ("〰〰〰", "~ ~ ~", "~~~"):
        parts = result.split(marker)
        if len(parts) == expected:
            return [p.strip() for p in parts]
    return None

def translate_one_by_one(batch_texts):
    # Fallback: one request per text. Returns (mapping, error_count); a text
    # whose request raises maps to "" and counts as one error.
    local = {}
    errs = 0
    for txt in batch_texts:
        try:
            tr = GoogleTranslator(source='ja', target='ko')
            local[txt] = tr.translate(txt) or ""
            time.sleep(0.1)
        except Exception:
            local[txt] = ""
            errs += 1
    return local, errs

def translate_batch(batch_texts):
    # Returns (mapping of original text -> translation, error_count).
    joined = SEP.join(batch_texts)
    for attempt in range(3):
        try:
            tr = GoogleTranslator(source='ja', target='ko')
            result = tr.translate(joined)
            if not result:
                raise ValueError("빈 응답")
        except Exception:
            if attempt < 2:
                time.sleep(1.5 * (attempt + 1))
            continue
        parts = split_translated(result, len(batch_texts))
        if parts is not None:
            return dict(zip(batch_texts, parts)), 0
        # The separator did not survive translation; retrying the same
        # request is unlikely to help, so translate the texts individually.
        break
    return translate_one_by_one(batch_texts)

# 병렬 실행
pbar = tqdm(total=len(batches), desc="🌐 번역 중", unit="batch")
t0 = time.time()

with ThreadPoolExecutor(max_workers=WORKERS) as pool:
    futures = {pool.submit(translate_batch, b): i
               for i, b in enumerate(batches)}
    for future in as_completed(futures):
        local, errs = future.result()
        translations.update(local)
        errors += errs
        pbar.update(1)
        done = len(translations)
        elapsed = time.time() - t0
        speed = done / elapsed if elapsed > 0 else 0
        remain = (len(unique_texts) - done) / speed if speed > 0 else 0
        pbar.set_postfix({
            "완료": f"{done:,}",
            "속도": f"{speed:.0f}건/s",
            "남은시간": f"{remain:.0f}s",
            "오류": errors
        })
pbar.close()

elapsed = time.time() - t0
# ── 3) translations.json 저장 ──
out_path = OUTPUT_DIR / "translations.json"
out_path.write_text(
    json.dumps(translations, ensure_ascii=False, indent=0),
    encoding="utf-8"
)
ok = sum(1 for v in translations.values() if v)
rate = len(unique_texts) / elapsed if elapsed > 0 else 0
print(f"\\n✅ 번역 완료! {ok:,}/{len(unique_texts):,}건 성공")
print(f"⏱️ 소요시간: {elapsed:.0f}초 ({elapsed/60:.1f}분)")
print(f"⚡ 속도: {rate:.0f}건/초")
if errors: print(f"⚠️ 오류: {errors}건")
print(f"📁 저장: {out_path}")
print(f"💡 이 파일이 zip에 포함되면 분석기에서 완전한 한국어 번역이 표시됩니다.")
"""

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Cell 4: zip download
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Notebook source for the last cell. It archives output/<GOODS_CODE> into
# qoo10_reviews_<GOODS_CODE>.zip with shutil.make_archive and hands the file
# to google.colab.files.download. GOODS_CODE is not defined in this cell; it
# must already exist in the notebook namespace (the crawl cell assigns it).
CELL_4 = """
import shutil
from google.colab import files
zip_name = f"qoo10_reviews_{GOODS_CODE}"
shutil.make_archive(zip_name, 'zip', f"output/{GOODS_CODE}")
print(f"📦 {zip_name}.zip 다운로드 중...")
print("  포함 파일: p*.html + meta.json + translations.json")
files.download(f"{zip_name}.zip")
"""

if __name__ == "__main__":
    # When run as a script, print a short guide followed by the source of
    # every notebook cell, numbered from 1, ready to paste into Colab.
    banner = "=" * 60
    print(banner)
    print("Qoo10 JP 리뷰 크롤러 + 번역 — Colab 가이드 v3.0")
    print(banner)
    print("\nColab 노트북에 아래 셀을 순서대로 붙여넣으세요:\n")

    cells = (
        ("설치", CELL_1),
        ("크롤링 (병렬 배치)", CELL_2),
        ("번역 (deep-translator 고속)", CELL_3),
        ("zip 다운로드", CELL_4),
    )
    for number, (title, source) in enumerate(cells, start=1):
        print(f"━━━ 셀 {number}: {title} ━━━")
        print(source.strip())
        print()
