import json
import random

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

# ---------------- CONFIG ----------------
SITEMAPS = [
    "https://www.detik.com/sitemap.xml",
    "https://www.kaskus.co.id/thread/sitemap.xml",
    "https://www.kaskus.co.id/profile/sitemap.xml",
    "https://www.kaskus.co.id/topic/sitemap.xml",
]
OUTPUT_FILE = "indochatbot_dataset.jsonl"
MAX_DATASET_SIZE_MB = 20
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    )
}

# ---------------- SCRAPING ----------------
def get_sitemap_urls(sitemap_url, visited=None):
    """Recursively collect article URLs from a sitemap, following nested sitemaps."""
    if visited is None:
        visited = set()
    if sitemap_url in visited:
        return []
    visited.add(sitemap_url)

    try:
        r = requests.get(sitemap_url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f"❌ Error fetching sitemap {sitemap_url}: {e}")
        return []

    # The "xml" parser requires lxml to be installed
    soup = BeautifulSoup(r.text, "xml")
    urls = []

    # Nested sitemaps
    for sitemap in soup.find_all("sitemap"):
        loc = sitemap.find("loc")
        if loc:
            urls.extend(get_sitemap_urls(loc.text.strip(), visited))

    # Article URLs
    for url_tag in soup.find_all("url"):
        loc = url_tag.find("loc")
        if loc:
            urls.append(loc.text.strip())

    return urls


def scrape_article(url):
    """Fetch a page and return its paragraph text, or None if it fails or is too short."""
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())
        if len(text.strip()) < 50:
            return None
        return {"url": url, "text": text}
    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return None


# ---------------- SCRAPE DATA ----------------
dataset = []
total_size_bytes = 0

all_urls = []
for sitemap in SITEMAPS:
    urls = get_sitemap_urls(sitemap)
    print(f"✅ Found {len(urls)} URLs in {sitemap}")
    all_urls.extend(urls)

random.shuffle(all_urls)

for url in tqdm(all_urls, desc="Scraping articles"):
    if total_size_bytes / (1024 * 1024) >= MAX_DATASET_SIZE_MB:
        print(f"⚠️ Dataset reached the {MAX_DATASET_SIZE_MB} MB limit, stopping scraping.")
        break
    article = scrape_article(url)
    if article:
        json_line = json.dumps(article, ensure_ascii=False)
        total_size_bytes += len(json_line.encode("utf-8"))
        dataset.append(article)

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for item in dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(
    f"✅ Dataset saved: {OUTPUT_FILE} "
    f"(total {len(dataset)} items, ~{total_size_bytes/(1024*1024):.2f} MB)"
)

# ---------------- PREPARE DATASET FOR TRAINING ----------------
texts = [item["text"] for item in dataset]
train_dataset = Dataset.from_dict({"text": texts})

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)


tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

# mlm=False -> causal LM objective; the collator derives labels from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model = GPT2LMHeadModel.from_pretrained("gpt2")

# ---------------- TRAINING ----------------
training_args = TrainingArguments(
    output_dir="./indochatbot_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=200,
    save_total_limit=2,
    logging_steps=50,
    learning_rate=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("🔹 Starting Indo chatbot model training...")
trainer.train()
trainer.save_model("./indochatbot_model")
print("🎉 Training finished. Model & tokenizer saved to ./indochatbot_model")
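
# ---------------- QUICK GENERATION CHECK (OPTIONAL SKETCH) ----------------
# A minimal sketch of loading the model saved above and generating one short
# continuation, as a sanity check on the fine-tune. This is not part of the
# original pipeline: the prompt string and the generation parameters
# (max_new_tokens, top_p, temperature) are illustrative assumptions.
chat_tokenizer = GPT2Tokenizer.from_pretrained("./indochatbot_model")
chat_model = GPT2LMHeadModel.from_pretrained("./indochatbot_model")

prompt = "Halo, apa kabar?"  # hypothetical test prompt ("Hello, how are you?")
inputs = chat_tokenizer(prompt, return_tensors="pt")
outputs = chat_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=chat_tokenizer.eos_token_id,
)
print(chat_tokenizer.decode(outputs[0], skip_special_tokens=True))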