Youtube2Feed/main.py

#!/usr/bin/env python3
"""
YouTube Transcript RSS Feed Generator - Main Pipeline
"""
import yaml
import os
import sys
from pathlib import Path

# Put the directory containing this file at the front of sys.path so the
# `src.*` imports below resolve against this project's own `src` package.
sys.path.insert(0, str(Path(__file__).parent))

from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator


def load_config(config_path: str = "config/config.yaml") -> dict:
    """Load the YAML configuration file.

    Args:
        config_path: Path to the YAML file. A relative path is resolved
            against the current working directory.

    Returns:
        The parsed configuration. An empty file (or one that contains only
        comments) yields an empty dict instead of ``None``, so callers can
        use ``config.get(...)`` unconditionally.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document; normalise that to an
    # empty mapping so the declared ``dict`` return type holds.
    return {} if config is None else config


def get_channel_id(config: dict) -> str:
    """Resolve the YouTube channel ID from the configuration.

    The ``channel`` section is consulted in priority order:

    1. ``id`` is returned as-is.
    2. ``handle_url`` is resolved through ``get_channel_id_from_handle``.
    3. ``handle`` is appended to ``https://www.youtube.com/`` and resolved
       the same way.

    Args:
        config: Parsed configuration mapping.

    Returns:
        The first truthy channel ID obtained from the sources above.

    Raises:
        ValueError: If the ``channel`` section is missing or empty, or if
            none of the configured sources yields a channel ID.
    """
    # A bare "channel:" key parses to None; treat it like a missing section
    # so the caller gets the descriptive ValueError below rather than an
    # AttributeError on None.
    channel_config = config.get('channel') or {}

    # An explicit ID needs no lookup.
    if channel_config.get('id'):
        return channel_config['id']

    # Collect the URLs to try, full handle URL first, then the bare handle.
    candidate_urls = []
    if channel_config.get('handle_url'):
        candidate_urls.append(channel_config['handle_url'])
    if channel_config.get('handle'):
        candidate_urls.append(
            f"https://www.youtube.com/{channel_config['handle']}"
        )

    for handle_url in candidate_urls:
        channel_id = get_channel_id_from_handle(handle_url)
        if channel_id:
            return channel_id

    raise ValueError("Channel ID bulunamadı! Config'de id, handle veya handle_url belirtin.")


def _config_section(config: dict, key: str) -> dict:
    """Return ``config[key]``, treating a missing or null section as empty."""
    return config.get(key) or {}


def _store_new_videos(db, videos, channel_id: str) -> int:
    """Add videos the database reports as unprocessed; return how many were added."""
    new_count = 0
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)
            new_count += 1
    return new_count


def _process_pending_videos(db, transcript_config: dict) -> None:
    """Fetch, clean and store transcripts for the pending videos in the database."""
    pending_videos = db.get_pending_videos()
    print(f"{len(pending_videos)} video işlenmeyi bekliyor")

    if not pending_videos:
        return

    extractor = TranscriptExtractor()
    cleaner = TranscriptCleaner()

    # Loop-invariant settings. An empty or null language list falls back to
    # English so the [0] lookup below cannot raise IndexError.
    languages = transcript_config.get('languages') or ['en']
    sentences_per_paragraph = transcript_config.get('paragraph_length', 3)

    # Only the first 10 pending videos are handled per run (the original
    # code marks this cap as a testing limit).
    for video in pending_videos[:10]:
        print(f"İşleniyor: {video['video_title']}")

        transcript = extractor.fetch_transcript(
            video['video_id'],
            languages=languages
        )

        if transcript:
            raw, clean = cleaner.clean_transcript(
                transcript,
                sentences_per_paragraph=sentences_per_paragraph
            )

            # NOTE(review): the stored language is the first configured one;
            # confirm whether fetch_transcript can return a different language.
            db.update_video_transcript(
                video['video_id'],
                raw,
                clean,
                status=1,  # success
                language=languages[0]
            )
            print(f"✓ Tamamlandı: {video['video_title']}")
        else:
            db.mark_video_failed(video['video_id'], "Transcript bulunamadı")
            print(f"✗ Başarısız: {video['video_title']}")


def _write_feed(db, config: dict, channel_id: str) -> None:
    """Write the RSS feed for the channel's processed videos, if there are any."""
    processed_videos = db.get_processed_videos(
        limit=_config_section(config, 'automation').get('max_items', 100),
        channel_id=channel_id
    )

    if not processed_videos:
        return

    rss_config = _config_section(config, 'rss')
    channel_config = _config_section(config, 'channel')
    channel_info = {
        'id': channel_id,
        'title': rss_config.get('title', 'Transcript Feed'),
        'link': channel_config.get('url', ''),
        'description': rss_config.get('description', ''),
        'language': channel_config.get('language', 'en')
    }

    generator = RSSGenerator(channel_info)
    for video in processed_videos:
        generator.add_video_entry(video)

    output_file = rss_config.get('output_file', 'transcript_feed.xml')
    output_path = f"output/{output_file}"
    os.makedirs('output', exist_ok=True)

    generator.generate_rss(output_path)
    print(f"RSS feed oluşturuldu: {output_path}")


def main():
    """Run the full pipeline: fetch videos, extract transcripts, write the feed.

    Steps:
        1. Load the configuration and resolve the channel ID.
        2. Fetch the channel's videos through RSS-Bridge and add the ones
           the database has not processed.
        3. Extract and clean transcripts for pending videos.
        4. Generate the RSS file under ``output/`` from processed videos.

    A failure while fetching videos is printed and ends the run early. The
    database connection is closed on every exit path once it has been
    created.
    """
    print("YouTube Transcript RSS Feed Generator başlatılıyor...")

    config = load_config()

    channel_id = get_channel_id(config)
    print(f"Channel ID: {channel_id}")

    db = Database()
    try:
        db.init_database()

        rss_bridge_config = _config_section(config, 'rss_bridge')
        print("RSS-Bridge'den videolar çekiliyor...")

        try:
            videos = fetch_videos_from_rss_bridge(
                base_url=rss_bridge_config.get('base_url', 'https://rss-bridge.org/bridge01'),
                channel_id=channel_id,
                format=rss_bridge_config.get('format', 'Atom'),
                max_items=rss_bridge_config.get('max_items', 100)
            )
            print(f"{len(videos)} video bulundu")
        except Exception as e:
            # Top-level boundary: report the fetch failure and stop; the
            # finally clause below still closes the database.
            print(f"Hata: {e}")
            return

        new_count = _store_new_videos(db, videos, channel_id)
        print(f"{new_count} yeni video eklendi")

        _process_pending_videos(db, _config_section(config, 'transcript'))

        _write_feed(db, config, channel_id)
    finally:
        # Close the connection on the early return and on unexpected errors
        # too, not just on the success path.
        db.close()

    print("Tamamlandı!")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()