☁️ Google Colab 방식 (추천)
1
2. 셀 4개 순서대로 실행
아래 코드를 각 셀에 붙여넣고 ▶ 실행하세요. URL만 바꾸면 어떤 상품이든 OK!
셀 1
설치 (최초 1회)
# Cell 1 (run once per runtime): install the Python packages quietly (-q),
# then the Chromium browser and the OS-level libraries Playwright needs.
# The leading "!" runs each line as a shell command from the notebook cell.
!pip install playwright tqdm deep-translator -q
!playwright install chromium
!playwright install-deps chromium
print("✅ 설치 완료! (Playwright + deep-translator)")
셀 2
크롤링 (병렬 배치 — 빠름!)
# ★ Edit only this section ★
PRODUCT_URL = "https://www.qoo10.jp/g/1102765780"
BATCH_SIZE = 10 # Number of concurrent requests (lower to 5 if you get blocked; up to 20 if there is headroom)
import re, asyncio, json, time, random, logging
from pathlib import Path
from tqdm.notebook import tqdm
from playwright.async_api import async_playwright
# Timestamped INFO-level logging for crawl progress and failures.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
def extract_code(url):
    """Extract the numeric goods code from a product URL or a bare code.

    Args:
        url: A product URL such as ``https://www.qoo10.jp/g/1102765780``
            (optionally followed by ``/``, a query string or a fragment),
            or a string made up only of digits.

    Returns:
        The goods code as a string of digits, or ``None`` when no code can
        be found.
    """
    text = url.strip()
    trimmed = text.rstrip('/')
    patterns = (
        # Preferred: an 8-12 digit path segment that ends the path
        # (end of string, start of query string, or start of fragment).
        r'/(\d{8,12})(?:\?|$|#)',
        # Fallback: the same segment followed by another "/", e.g.
        # ".../g/1102765780/?ref=x", which the preferred form rejects.
        r'/(\d{8,12})(?=/)',
    )
    for pat in patterns:
        m = re.search(pat, trimmed)
        if m:
            return m.group(1)
    # A bare goods code pasted instead of a URL.
    if re.fullmatch(r'\d+', text):
        return text
    return None
# Resolve the goods code first; every later step depends on it.
GOODS_CODE = extract_code(PRODUCT_URL)
if not GOODS_CODE:
    raise ValueError(f"상품코드 추출 실패: {PRODUCT_URL}")
print(f"🔍 상품코드: {GOODS_CODE}", f"🔗 URL: {PRODUCT_URL}", sep="\n")

# Crawled pages and metadata are stored under output/<goods code>/.
OUTPUT_DIR = Path("output") / GOODS_CODE
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Endpoint that returns one page of review HTML.
REVIEW_AJAX = "https://www.qoo10.jp/gmkt.inc/Goods/GoodsReviewAjaxAppend.aspx"

# Desktop Chrome user-agent string handed to the browser context.
UA = " ".join([
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/120.0.0.0 Safari/537.36",
])
async def setup_session(page):
    """Open the product page, switch to the review tab and read paging info.

    Args:
        page: Playwright page used for the navigation.

    Returns:
        Tuple ``(product_name, total_count, page_size, total_pages)``.
        ``product_name`` falls back to a placeholder built from the goods
        code when the title cannot be read. If anything else fails, the
        error is logged and ``(product_name, 0, 10, 0)`` is returned.
    """
    product_name = f"상품_{GOODS_CODE}"
    try:
        await page.goto(PRODUCT_URL, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(3000)
        try:
            product_name = (
                await page.locator("h1, .goods_name").first.inner_text()
            ).strip()[:80]
        except Exception as e:
            # The name is cosmetic, so keep the placeholder. Catch only
            # Exception so task cancellation / Ctrl-C still propagate
            # (a bare ``except`` would swallow them).
            logger.debug("product name lookup failed: %s", e)
        # The paging inputs are read after opening the review tab.
        await page.locator("li:has-text('レビュー')").first.click()
        await page.wait_for_timeout(2000)
        total_count = int(await page.eval_on_selector("#total_count", "el => el.value"))
        page_size = int(await page.eval_on_selector("#page_size", "el => el.value"))
        # Ceiling division: a partially filled last page still counts.
        total_pages = (total_count + page_size - 1) // page_size
        logger.info(f"상품명: {product_name} | 총 리뷰: {total_count:,}개 | {total_pages:,}페이지")
        return product_name, total_count, page_size, total_pages
    except Exception as e:
        logger.error(f"세션 초기화 실패: {e}")
        return product_name, 0, 10, 0
def build_url(pn):
    """Return the review-AJAX URL for review page number *pn*.

    The ``___cache_expire___`` value is the current time in milliseconds
    plus *pn*, so URLs built in the same instant for different pages differ.

    NOTE(review): ``page_size`` is fixed at 10 here, while the page count
    elsewhere is derived from the page's own ``#page_size`` field - confirm
    the two always agree.
    """
    cache_buster = int(time.time() * 1000) + pn
    query = "&".join((
        f"gd_no={GOODS_CODE}",
        "group_code=2",
        f"page_no={pn}",
        "page_size=10",
        "sort_type=P",
        "contents_cnt=0",
        f"___cache_expire___={cache_buster}",
    ))
    return f"{REVIEW_AJAX}?{query}"
async def fetch_batch(page, page_numbers):
    """Fetch several review pages at once from inside the browser page.

    Args:
        page: Playwright page whose JavaScript context issues the requests.
        page_numbers: Review page numbers to request.

    Returns:
        One response body per requested page, in request order. A request
        that throws inside the browser yields ``''``; if the whole
        evaluation fails, a list of empty strings of the same length is
        returned and a warning is logged.
    """
    targets = list(map(build_url, page_numbers))
    # Runs in the page: fire all fetches together and wait for all of them.
    script = """async (urls) => Promise.all(urls.map(async url => {
try {
const r = await fetch(url, {credentials: 'include'});
return await r.text();
} catch { return ''; }
}))"""
    try:
        return await page.evaluate(script, targets)
    except Exception as exc:
        logger.warning(f"배치 실패: {exc}")
        return [""] * len(page_numbers)
async def fetch_single(page, pn):
    """Fetch one review page from inside the browser page.

    Args:
        page: Playwright page whose JavaScript context issues the request.
        pn: Review page number.

    Returns:
        The response body when it is non-empty and contains ``<li``,
        otherwise ``""``. Failures are logged as warnings and also give ``""``.
    """
    script = """async url => {
const r = await fetch(url, {credentials: 'include'});
return await r.text();
}"""
    try:
        body = await page.evaluate(script, build_url(pn))
        # A body without any list item is treated as a failed page.
        if not body or "<li" not in body:
            return ""
        return body
    except Exception as exc:
        logger.warning(f"페이지 {pn} 실패: {exc}")
        return ""
async def run_crawler():
    """Crawl every review page of the configured product and save the HTML.

    Steps:
      1. Launch headless Chromium and initialise the session with
         ``setup_session``.
      2. Fetch review pages in batches of ``BATCH_SIZE``. Pages whose body
         is empty or has no ``<li`` are retried one by one, up to three
         times, with a growing delay (1.5s, 3s, 4.5s).
      3. Write each page to ``OUTPUT_DIR / "p<N>.html"``.
      4. Write ``meta.json`` and print a summary.

    Returns:
        None. Returns early, without writing ``meta.json``, when no review
        pages were found.
    """
    t0 = time.time()
    name, total_count, page_size, total_pages = f"상품_{GOODS_CODE}", 0, 10, 0
    ok, fail = 0, 0
    browser = None
    async with async_playwright() as p:
        try:
            browser = await p.chromium.launch(headless=True)
            ctx = await browser.new_context(user_agent=UA, locale="ja-JP")
            page = await ctx.new_page()
            name, total_count, page_size, total_pages = await setup_session(page)
            if total_pages == 0:
                logger.error("❌ 리뷰를 찾을 수 없습니다.")
                return
            # Page numbers are 1-based; the last batch may be shorter.
            batches = [
                list(range(i, min(i + BATCH_SIZE, total_pages + 1)))
                for i in range(1, total_pages + 1, BATCH_SIZE)
            ]
            pbar = tqdm(batches, desc="📥 크롤링", unit="batch")
            for batch in pbar:
                results = await fetch_batch(page, batch)
                retry_pages = []
                for pn, html in zip(batch, results):
                    if html and "<li" in html:
                        (OUTPUT_DIR / f"p{pn}.html").write_text(html, encoding="utf-8")
                        ok += 1
                    else:
                        retry_pages.append(pn)
                for pn in retry_pages:
                    saved = False
                    for attempt in range(3):
                        await asyncio.sleep((attempt + 1) * 1.5)
                        html = await fetch_single(page, pn)
                        if html:
                            (OUTPUT_DIR / f"p{pn}.html").write_text(html, encoding="utf-8")
                            ok += 1
                            saved = True
                            break
                    if not saved:
                        fail += 1
                        logger.error(f"페이지 {pn} 최종 실패")
                # Small random pause between batches.
                await asyncio.sleep(random.uniform(0.2, 0.5))
                pbar.set_postfix({"성공": ok, "실패": fail})
                # Reload the product page whenever the success count sits
                # just past a multiple of 500.
                if ok % 500 < BATCH_SIZE:
                    try:
                        await page.goto(PRODUCT_URL, wait_until="domcontentloaded", timeout=30000)
                        await page.wait_for_timeout(1500)
                    except Exception as e:
                        # The reload is best-effort: a navigation timeout
                        # must not abort the crawl and lose meta.json.
                        logger.warning(f"세션 갱신 실패 (계속 진행): {e}")
        finally:
            if browser:
                await browser.close()
    meta = {
        "goods_code": GOODS_CODE, "product_name": name, "product_url": PRODUCT_URL,
        "total_reviews": total_count, "total_pages": total_pages,
        "page_size": page_size, "crawled_pages": ok, "failed_pages": fail,
        "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    (OUTPUT_DIR / "meta.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    elapsed = time.time() - t0
    print(f"\n✅ 완료! {ok:,}/{total_pages:,}페이지 | {elapsed/60:.1f}분 소요")
# Top-level await: intended for a notebook cell (Colab/IPython). In a plain
# .py script this would need asyncio.run(run_crawler()) instead.
await run_crawler()
셀 3
🆕 리뷰 한국어 번역 (무료 자동번역)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 셀 3: 리뷰 텍스트 한국어 번역 (고속 v3)
# deep-translator + 병렬 처리
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
import subprocess, sys, json, time, re
from pathlib import Path
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
# ── Auto-install missing packages ──
for pkg in ["deep-translator"]:
    try:
        # The pip name uses "-", the importable module name uses "_".
        __import__(pkg.replace("-", "_"))
    except ImportError:
        # Install only when the module is really missing; a bare ``except``
        # would also turn Ctrl-C during the import into a pip run.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])
from deep_translator import GoogleTranslator
# Same directory the crawl cell wrote to (GOODS_CODE comes from that cell).
OUTPUT_DIR = Path(f"output/{GOODS_CODE}")
# ── 1) Extract review text from the crawled HTML ──
class ReviewExtractor(HTMLParser):
    """Collect the text of elements whose ``class`` contains ``review_txt``.

    After ``feed()``, ``texts`` holds the stripped, non-empty texts in
    document order.
    """

    def __init__(self):
        super().__init__()
        self._in_txt = False  # True while inside a review_txt element
        self.texts = []       # extracted review texts
        self._buf = ""        # text collected for the current element

    def handle_starttag(self, tag, attrs):
        """Start capturing when the tag's class contains ``review_txt``."""
        # HTMLParser reports a valueless attribute (``<div class>``) as
        # None, so ``.get("class", "")`` alone would hand None to ``in``
        # and raise TypeError.
        cls = dict(attrs).get("class") or ""
        if "review_txt" in cls:
            self._in_txt = True
            self._buf = ""

    def handle_data(self, data):
        """Append text to the buffer while inside a review element."""
        if self._in_txt:
            self._buf += data

    def handle_endtag(self, tag):
        """Finish the current review at the first closing p/div/span."""
        # NOTE(review): capture ends at the first closing p/div/span, so
        # text after a nested inline element would be dropped - confirm the
        # review markup never nests these tags.
        if self._in_txt and tag in ("p", "div", "span"):
            t = self._buf.strip()
            if t:
                self.texts.append(t)
            self._in_txt = False
# Gather review texts from every saved page, in numeric page order.
all_texts = []
html_files = sorted(
    OUTPUT_DIR.glob("p*.html"),
    key=lambda f: int(re.sub(r'\D', '', f.stem) or 0),
)
print(f"📄 {len(html_files)}개 HTML 파일에서 리뷰 추출 중...")
for html_file in html_files:
    extractor = ReviewExtractor()
    extractor.feed(html_file.read_text(encoding="utf-8", errors="ignore"))
    all_texts += extractor.texts

# De-duplicate while keeping first-seen order (dict keys keep insertion order).
unique_texts = list(dict.fromkeys(all_texts))
seen = set(unique_texts)
print(f"✅ 고유 리뷰 {len(unique_texts):,}건 추출 완료")
# ── 2) Fast batched translation ──
SEP = "\n〰〰〰\n"      # separator placed between texts within one request
BATCH_CHARS = 4800     # character budget for one batch
WORKERS = 4            # number of translation threads

# Greedily pack texts into batches without exceeding the budget; each text
# is charged its own length plus one separator.
batches = []
buf_texts, buf_len = [], 0
sep_len = len(SEP)
for text in unique_texts:
    cost = len(text) + sep_len
    if buf_texts and buf_len + cost > BATCH_CHARS:
        batches.append(buf_texts)
        buf_texts, buf_len = [], 0
    buf_texts.append(text)
    buf_len += cost
if buf_texts:
    batches.append(list(buf_texts))

avg_per_batch = len(unique_texts) // (len(batches) or 1)
print(f"🚀 {len(unique_texts):,}건 → {len(batches):,}배치 "
      f"(~{avg_per_batch}건/배치, {WORKERS}스레드 병렬)")
translations = {}
errors = 0
def _split_translated(result, expected):
    """Split a joined translation into exactly *expected* parts, else None."""
    # The translator may rewrite the separator, so try the known variants.
    for sep in ("〰〰〰", "~ ~ ~", "~~~"):
        parts = result.split(sep)
        if len(parts) == expected:
            return [part.strip() for part in parts]
    return None


def _translate_individually(batch_texts):
    """Translate texts one request each; return ``(mapping, error_count)``."""
    local = {}
    errs = 0
    for txt in batch_texts:
        try:
            tr = GoogleTranslator(source='ja', target='ko')
            local[txt] = tr.translate(txt) or ""
            time.sleep(0.1)
        except Exception:
            local[txt] = ""
            errs += 1
    return local, errs


def translate_batch(batch_texts):
    """Translate one batch of Japanese review texts into Korean.

    The texts are joined with ``SEP`` and sent as a single request, then the
    response is split back into one translation per text. The joined request
    is tried up to three times (sleeping 1.5s, then 3s between attempts).
    If it keeps failing, or the response cannot be split into exactly one
    part per text, each text is translated on its own instead.

    Args:
        batch_texts: Source texts to translate.

    Returns:
        ``(mapping, error_count)``. ``mapping`` maps every source text to
        its translation (``""`` when it could not be translated);
        ``error_count`` is the number of per-text requests that raised.
    """
    if not batch_texts:
        return {}, 0
    joined = SEP.join(batch_texts)
    for attempt in range(3):
        try:
            tr = GoogleTranslator(source='ja', target='ko')
            result = tr.translate(joined)
            if not result:
                raise ValueError("빈 응답")
        except Exception:
            if attempt < 2:
                time.sleep(1.5 * (attempt + 1))
                continue
            break
        parts = _split_translated(result, len(batch_texts))
        if parts is not None:
            return dict(zip(batch_texts, parts)), 0
        # A wrong part count means the separators were altered. Pairing the
        # parts positionally would attach translations to the wrong reviews,
        # and resending the same payload is unlikely to help.
        break
    return _translate_individually(batch_texts)
# Translate all batches on a thread pool, updating the progress bar as each
# batch finishes (completion order, not submission order).
pbar = tqdm(total=len(batches), desc="🌐 번역 중", unit="batch")
t0 = time.time()
with ThreadPoolExecutor(max_workers=WORKERS) as pool:
    futures = {pool.submit(translate_batch, b): i
               for i, b in enumerate(batches)}
    for future in as_completed(futures):
        local, errs = future.result()
        translations.update(local)
        errors += errs
        pbar.update(1)
        done = len(translations)
        elapsed = time.time() - t0
        speed = done / elapsed if elapsed > 0 else 0
        remain = (len(unique_texts) - done) / speed if speed > 0 else 0
        pbar.set_postfix({
            "완료": f"{done:,}",
            "속도": f"{speed:.0f}건/s",
            "남은시간": f"{remain:.0f}s",
            "오류": errors
        })
pbar.close()
elapsed = time.time() - t0

# Save the source-text -> translation mapping next to the crawled pages.
out_path = OUTPUT_DIR / "translations.json"
out_path.write_text(
    json.dumps(translations, ensure_ascii=False, indent=0),
    encoding="utf-8"
)
ok = sum(1 for v in translations.values() if v)
# Guard the division the same way the in-loop speed calculation does, so a
# zero elapsed time cannot raise ZeroDivisionError.
overall_speed = len(unique_texts) / elapsed if elapsed > 0 else 0
print(f"\n✅ 번역 완료! {ok:,}/{len(unique_texts):,}건 성공")
print(f"⏱️ 소요시간: {elapsed:.0f}초 ({elapsed/60:.1f}분)")
print(f"⚡ 속도: {overall_speed:.0f}건/초")
if errors:
    print(f"⚠️ 오류: {errors}건")
print(f"📁 저장: {out_path}")
print(f"💡 이 파일이 zip에 포함되면 분석기에서 완전한 한국어 번역이 표시됩니다.")
💡 선택사항이지만 강력 추천! 4스레드 병렬 + 배치 합치기로 리뷰 10,000건도 약 1~3분이면 완료됩니다.
셀 4
zip 다운로드
import shutil
from google.colab import files
# Zip the product's output folder and send the archive to the browser.
zip_name = f"qoo10_reviews_{GOODS_CODE}"
source_dir = f"output/{GOODS_CODE}"
archive_file = f"{zip_name}.zip"
shutil.make_archive(zip_name, 'zip', source_dir)
print(f"📦 {zip_name}.zip 다운로드 중...")
print(" 포함 파일: p*.html + meta.json + translations.json")
files.download(archive_file)
3. zip 파일 업로드 → 즉시 분석!
범용 분석기를 열고, 다운로드된 .zip 파일을 그대로 업로드하세요.
✅ 이렇게 진행됩니다:
Colab에서 zip 자동 다운로드 → 분석기에 드래그&드롭 → 즉시 대시보드 생성!
📌 translations.json이 포함되면 리뷰가 완전한 한국어로 표시됩니다.
🐍 크롤러 스크립트 다운로드
Colab을 사용하시면 위 가이드만으로 충분합니다. 로컬 PC를 쓸 경우 아래 파일을 다운로드하세요.