""" GabForge — Train All 3 Models on Vast.ai ========================================= Trains sequentially on a single GPU (96GB RTX PRO 6000): 1. GabForge Mini Vision (Qwen 3.5 9B + ViT) — ~3-4 hours 2. GabForge PA v1 (QwQ 32B) — ~6-8 hours 3. GabForge Pro Vision (Qwen 3.5 27B + ViT) — ~8-12 hours Total: ~17-24 hours on 96GB VRAM Usage: pip install unsloth torch transformers datasets peft trl bitsandbytes accelerate python train_all_vastai.py """ import json import os import sys import time from pathlib import Path # ─── Check dependencies ───────────────────── def check_deps(): missing = [] for pkg in ["torch", "transformers", "datasets", "peft", "trl", "unsloth"]: try: __import__(pkg) except ImportError: missing.append(pkg) if missing: print(f"Missing: {', '.join(missing)}") print("Run: pip install unsloth torch transformers datasets peft trl bitsandbytes accelerate") sys.exit(1) check_deps() import torch from datasets import Dataset from transformers import TrainingArguments from unsloth import FastLanguageModel from trl import SFTTrainer DATA_DIR = Path("/workspace/data") OUTPUT_DIR = Path("/workspace/output") # ─── Shared utilities ──────────────────────── def load_text_dataset(path: Path, split_ratio: float = 0.95): """Load JSONL chat dataset for text model training.""" examples = [] with open(path) as f: for line in f: if line.strip(): examples.append(json.loads(line)) texts = [] for ex in examples: parts = [] for msg in ex.get("messages", []): role, content = msg["role"], msg["content"] if role == "system": parts.append(f"<|im_start|>system\n{content}<|im_end|>") elif role == "user": parts.append(f"<|im_start|>user\n{content}<|im_end|>") elif role == "assistant": parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") texts.append({"text": "\n".join(parts)}) split_idx = int(len(texts) * split_ratio) return Dataset.from_list(texts[:split_idx]), Dataset.from_list(texts[split_idx:]) def load_vision_dataset(path: Path, split_ratio: float = 0.95): """Load JSONL vision dataset.""" examples = [] with open(path) as f: for line in f: if line.strip(): examples.append(json.loads(line)) texts = [{"text": json.dumps(ex)} for ex in examples] split_idx = int(len(texts) * split_ratio) return Dataset.from_list(texts[:split_idx]), Dataset.from_list(texts[split_idx:]) def train_model( model_name: str, base_model: str, dataset_path: Path, output_name: str, max_seq_length: int = 4096, epochs: int = 3, batch_size: int = 2, grad_accum: int = 4, lr: float = 2e-4, lora_rank: int = 16, lora_alpha: int = 32, is_vision: bool = False, ): """Train a single model with QLoRA.""" print("\n" + "=" * 60) print(f" Training: {model_name}") print(f" Base: {base_model}") print(f" Dataset: {dataset_path}") print(f" Output: {output_name}") print(f" Seq Length: {max_seq_length}") print(f" Epochs: {epochs}, Batch: {batch_size}x{grad_accum}={batch_size * grad_accum}") print("=" * 60) start = time.time() output_path = OUTPUT_DIR / output_name # Load dataset print("\n[1/4] Loading dataset...") if is_vision: train_data, eval_data = load_vision_dataset(dataset_path) else: train_data, eval_data = load_text_dataset(dataset_path) print(f" {len(train_data)} train, {len(eval_data)} eval") # Load model print("\n[2/4] Loading model (4-bit quantized)...") model, tokenizer = FastLanguageModel.from_pretrained( model_name=base_model, max_seq_length=max_seq_length, dtype=torch.bfloat16, load_in_4bit=True, ) # Apply LoRA print("\n[3/4] Applying LoRA adapters...") model = FastLanguageModel.get_peft_model( model, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=0.05, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias="none", ) trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f" Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)") # Train print("\n[4/4] Training...") output_path.mkdir(parents=True, exist_ok=True) training_args = TrainingArguments( output_dir=str(output_path), num_train_epochs=epochs, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, gradient_accumulation_steps=grad_accum, learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type="cosine", logging_steps=5, save_steps=100, eval_steps=100, eval_strategy="steps", save_total_limit=2, bf16=True, gradient_checkpointing=True, gradient_checkpointing_kwargs={"use_reentrant": False}, max_grad_norm=1.0, optim="paged_adamw_8bit", report_to="none", seed=42, dataloader_num_workers=4, ) trainer = SFTTrainer( model=model, args=training_args, train_dataset=train_data, eval_dataset=eval_data, tokenizer=tokenizer, max_seq_length=max_seq_length, dataset_text_field="text", packing=True, ) trainer.train() trainer.save_model() elapsed = (time.time() - start) / 3600 print(f"\n✅ {model_name} complete! ({elapsed:.1f} hours)") print(f" Saved to: {output_path}") # Free GPU memory del model, trainer torch.cuda.empty_cache() return output_path # ─── Main ──────────────────────────────────── def main(): print("╔══════════════════════════════════════════════════════╗") print("║ GabForge AI — Train All 3 Models ║") print("║ GPU: RTX PRO 6000 (96GB VRAM) ║") print("╚══════════════════════════════════════════════════════╝") gpu = torch.cuda.get_device_name(0) vram = torch.cuda.get_device_properties(0).total_memory / 1024**3 print(f"\nGPU: {gpu} ({vram:.0f} GB)") total_start = time.time() results = [] # ── Model 1: GabForge Mini Vision (fastest, validates setup) ── pa_data = DATA_DIR / "training_data.jsonl" vision_data = DATA_DIR / "train_websight_6300.jsonl" if vision_data.exists(): r = train_model( model_name="GabForge Mini Vision", base_model="Qwen/Qwen3.5-Coder-9B-Instruct", dataset_path=vision_data, output_name="gabforge-mini-vision-v2", max_seq_length=4096, epochs=3, batch_size=4, grad_accum=2, lr=1e-4, lora_rank=16, lora_alpha=32, is_vision=True, ) results.append(("Mini Vision", r)) else: print(f"\n⚠️ Skipping Mini Vision — {vision_data} not found") # ── Model 2: GabForge PA v1 ── if pa_data.exists(): r = train_model( model_name="GabForge PA v1", base_model="Qwen/QwQ-32B", dataset_path=pa_data, output_name="gabforge-pa-v1", max_seq_length=4096, epochs=3, batch_size=2, grad_accum=4, lr=2e-4, lora_rank=16, lora_alpha=32, ) results.append(("PA v1", r)) else: print(f"\n⚠️ Skipping PA — {pa_data} not found") # ── Model 3: GabForge Pro Vision ── if vision_data.exists(): r = train_model( model_name="GabForge Pro Vision", base_model="Qwen/Qwen3.5-Coder-27B-Instruct", dataset_path=vision_data, output_name="gabforge-pro-vision-v1", max_seq_length=4096, epochs=2, batch_size=2, grad_accum=4, lr=5e-5, lora_rank=32, lora_alpha=64, is_vision=True, ) results.append(("Pro Vision", r)) else: print(f"\n⚠️ Skipping Pro Vision — {vision_data} not found") # ── Summary ── total_hours = (time.time() - total_start) / 3600 print("\n" + "=" * 60) print(" TRAINING COMPLETE") print("=" * 60) for name, path in results: print(f" ✅ {name}: {path}") print(f"\n Total time: {total_hours:.1f} hours") print(f" Output dir: {OUTPUT_DIR}") print("\n Next: Download output/ folder to local machine") print(" Then: Convert to GGUF for deployment") if __name__ == "__main__": main()