Files
Youtube2Feed/src/web_server.py
2025-11-13 05:31:43 +03:00

452 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Flask web server - RSS-Bridge benzeri URL template sistemi
"""
from flask import Flask, request, Response, jsonify, g
from typing import Optional
import sys
import os
import yaml
import time
import logging
import random
from pathlib import Path
# Logger oluştur
logger = logging.getLogger(__name__)
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator
from src.security import (
init_security, get_security_manager,
require_api_key, rate_limit, validate_input
)
app = Flask(__name__)

# Security config cache — populated lazily by load_security_config().
_security_config = None
def load_security_config():
    """Load and cache the security configuration.

    Reads ``config/security.yaml`` relative to the project root on first
    call and returns the ``security`` section (an empty dict when the file
    is absent). Subsequent calls return the cached dict.
    """
    global _security_config
    if _security_config is not None:
        return _security_config

    cfg_file = Path(__file__).parent.parent / 'config' / 'security.yaml'
    if cfg_file.exists():
        with open(cfg_file, 'r', encoding='utf-8') as fh:
            _security_config = yaml.safe_load(fh).get('security', {})
    else:
        _security_config = {}
    return _security_config
# Initialise the security manager
def init_app_security():
    """Initialise the security manager at application startup.

    Pulls API keys and the default rate limit from the security config and
    hands them to :func:`init_security`.
    """
    cfg = load_security_config()
    init_security(
        cfg.get('api_keys', {}),
        cfg.get('default_rate_limit', 60),
    )
# Security headers and CORS middleware
@app.after_request
def add_security_headers(response):
    """Attach configured security headers, CORS headers and rate-limit info.

    Runs after every request. Header values come from the security config;
    CORS is enabled by default and echoes the request Origin only when it is
    explicitly whitelisted.
    """
    cfg = load_security_config()

    # Plain security headers, copied verbatim from the config file.
    for name, value in cfg.get('security_headers', {}).items():
        response.headers[name] = value

    # CORS headers.
    cors = cfg.get('cors', {})
    if cors.get('enabled', True):
        allowed = cors.get('allowed_origins', ['*'])
        if '*' in allowed:
            response.headers['Access-Control-Allow-Origin'] = '*'
        else:
            req_origin = request.headers.get('Origin')
            if req_origin in allowed:
                response.headers['Access-Control-Allow-Origin'] = req_origin
        response.headers['Access-Control-Allow-Methods'] = ', '.join(
            cors.get('allowed_methods', ['GET', 'OPTIONS'])
        )
        response.headers['Access-Control-Allow-Headers'] = ', '.join(
            cors.get('allowed_headers', ['Content-Type', 'X-API-Key'])
        )

    # Expose the remaining rate-limit quota when the limiter recorded it on g.
    if hasattr(g, 'rate_limit_remaining'):
        response.headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)
    return response
# OPTIONS handler for CORS
@app.route('/', methods=['OPTIONS'])
@app.route('/<path:path>', methods=['OPTIONS'])
def handle_options(path=None):
    """Answer CORS preflight (OPTIONS) requests with an empty 200 response.

    The actual CORS headers are attached by the after_request middleware.
    """
    return Response(status=200)
# Initialise security once at import time (application startup).
init_app_security()

# Global singleton instances — created lazily by the get_* accessors below.
db = None
extractor = None
cleaner = None
def get_db():
    """Return the process-wide Database singleton, creating it on first use."""
    global db
    if db is not None:
        return db
    db = Database()
    db.init_database()
    return db
def get_extractor():
    """Return the shared TranscriptExtractor instance (lazy singleton)."""
    global extractor
    if extractor is not None:
        return extractor
    extractor = TranscriptExtractor()
    return extractor
def get_cleaner():
    """Return the shared TranscriptCleaner instance (lazy singleton)."""
    global cleaner
    if cleaner is not None:
        return cleaner
    cleaner = TranscriptCleaner()
    return cleaner
def normalize_channel_id(channel_id: Optional[str] = None,
                         channel: Optional[str] = None,
                         channel_url: Optional[str] = None) -> Optional[str]:
    """
    Normalise a channel reference given in any of three formats.

    Args:
        channel_id: Raw channel ID (``UC...``), or a ``/channel/`` URL that a
            caller pasted into this field by mistake.
        channel: Channel handle (``@username``) or bare username.
        channel_url: Full YouTube channel URL (handle- or ID-style).

    Returns:
        The canonical channel ID, or None when nothing usable was supplied.
    """
    if channel_id:
        # Already canonical: "UC" prefix and exactly 24 characters.
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        # Tolerate a full /channel/ URL pasted into the channel_id field.
        if 'youtube.com/channel/' in channel_id:
            pieces = channel_id.split('/channel/')
            if len(pieces) > 1:
                return pieces[-1].split('?')[0].split('/')[0]

    if channel:
        # Handle form: ensure the '@' prefix, then resolve via the handle URL.
        handle = channel if channel.startswith('@') else f"@{channel}"
        return get_channel_id_from_handle(f"https://www.youtube.com/{handle}")

    if channel_url:
        if '/@' in channel_url:
            # Handle-style URL — resolve to an ID.
            return get_channel_id_from_handle(channel_url)
        if '/channel/' in channel_url:
            # ID-style URL — extract the ID, dropping query string / trailing path.
            pieces = channel_url.split('/channel/')
            if len(pieces) > 1:
                return pieces[-1].split('?')[0].split('/')[0]

    return None
def process_channel(channel_id: str, max_items: int = 50) -> dict:
    """
    Build the transcript feed data for one channel.

    Fetches the channel's recent videos from RSS-Bridge, stores unseen ones
    in the database, extracts and cleans transcripts in small batches with
    long random pauses between batches (to reduce the risk of YouTube IP
    blocking), and finally returns the already-processed videos.

    Args:
        channel_id: Canonical YouTube channel ID (``UC...``).
        max_items: Maximum number of transcripts to process in this call.

    Returns:
        dict with keys ``videos`` (processed rows from the DB),
        ``channel_id`` and ``count``.

    Raises:
        Exception: When the RSS-Bridge fetch fails.
    """
    db = get_db()
    extractor = get_extractor()
    cleaner = get_cleaner()
    # Fetch the video list from RSS-Bridge. We request roughly twice
    # max_items because some videos will turn out to have no transcript.
    rss_bridge_limit = max(max_items * 2, 50)  # fetch at least 50 videos
    logger.info(f"[PROCESS] Channel {channel_id} için RSS-Bridge'den video listesi çekiliyor (limit: {rss_bridge_limit})")
    try:
        videos = fetch_videos_from_rss_bridge(
            base_url="https://rss-bridge.org/bridge01",
            channel_id=channel_id,
            format="Atom",
            max_items=rss_bridge_limit
        )
        logger.info(f"[PROCESS] RSS-Bridge'den {len(videos)} video alındı")
    except Exception as e:
        logger.error(f"[PROCESS] ❌ RSS-Bridge hatası: {type(e).__name__} - {str(e)}")
        # NOTE(review): re-raising a bare Exception loses the original
        # traceback chain — consider `raise ... from e`.
        raise Exception(f"RSS-Bridge hatası: {e}")
    # Insert videos we have not processed before into the database.
    new_videos_count = 0
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)
            new_videos_count += 1
    if new_videos_count > 0:
        logger.info(f"[PROCESS] {new_videos_count} yeni video veritabanına eklendi")
    else:
        logger.debug(f"[PROCESS] Tüm videolar zaten veritabanında")
    # Process pending videos in small batches (up to max_items transcripts).
    # Small batches reduce the risk of YouTube blocking our IP.
    batch_size = 5  # videos per batch (small batch = lower blocking risk)
    processed_count = 0  # transcripts processed so far in this call
    # All pending videos for this channel only.
    all_pending_videos = [v for v in db.get_pending_videos() if v['channel_id'] == channel_id]
    logger.info(f"[PROCESS] Channel {channel_id} için {len(all_pending_videos)} bekleyen video bulundu (max_items: {max_items})")
    # Ceiling division: number of batches needed to cover all pending videos.
    total_batches = (len(all_pending_videos) + batch_size - 1) // batch_size
    current_batch = 0
    for batch_start in range(0, len(all_pending_videos), batch_size):
        if processed_count >= max_items:
            logger.info(f"[PROCESS] Maksimum transcript sayısına ulaşıldı ({processed_count}/{max_items})")
            break
        current_batch += 1
        batch_videos = all_pending_videos[batch_start:batch_start + batch_size]
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} başlatılıyor ({len(batch_videos)} video, Toplam işlenen: {processed_count}/{max_items})")
        batch_processed = 0
        batch_cached = 0
        batch_failed = 0
        for video in batch_videos:
            if processed_count >= max_items:
                break
            video_id = video['video_id']
            video_title = video.get('video_title', 'N/A')[:50]
            # Cache check: skip videos whose transcript was processed
            # within the last 3 days.
            if db.is_transcript_cached(video_id, cache_days=3):
                logger.debug(f"[CACHE] Video {video_id} ({video_title}) transcript'i cache'de, atlanıyor")
                batch_cached += 1
                continue
            try:
                logger.info(f"[VIDEO] Video işleniyor: {video_id} - {video_title}")
                # Extract the transcript (Turkish preferred, English fallback).
                transcript = extractor.fetch_transcript(
                    video_id,
                    languages=['tr', 'en']
                )
                if transcript:
                    # Clean the transcript into paragraphs.
                    logger.debug(f"[VIDEO] Video {video_id} transcript'i temizleniyor...")
                    raw, clean = cleaner.clean_transcript(transcript, sentences_per_paragraph=3)
                    # Persist immediately so each batch's work survives failures.
                    # NOTE(review): language is hard-coded to 'tr' even when the
                    # English transcript was fetched — confirm intent.
                    db.update_video_transcript(
                        video_id,
                        raw,
                        clean,
                        status=1,
                        language='tr'
                    )
                    processed_count += 1
                    batch_processed += 1
                    logger.info(f"[VIDEO] ✅ Video {video_id} başarıyla işlendi ve kaydedildi ({processed_count}/{max_items})")
                else:
                    logger.warning(f"[VIDEO] ⚠️ Video {video_id} transcript'i alınamadı (None döndü)")
                    batch_failed += 1
                    db.mark_video_failed(video_id, "Transcript None döndü")
            except Exception as e:
                error_type = type(e).__name__
                error_msg = str(e)[:200]
                logger.error(f"[VIDEO] ❌ Video {video_id} işleme hatası: {error_type} - {error_msg}")
                db.mark_video_failed(video_id, str(e))
                batch_failed += 1
        # Batch summary.
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} tamamlandı - İşlenen: {batch_processed}, Cache: {batch_cached}, Başarısız: {batch_failed}")
        # Long randomised pause between batches (YouTube IP-blocking avoidance),
        # skipped after the final batch or once the quota is reached.
        if processed_count < max_items and batch_start + batch_size < len(all_pending_videos):
            wait_time = 60 + random.uniform(0, 30)  # 60-90 s random (human-like)
            logger.info(f"[BATCH] Batch'ler arası bekleme: {wait_time:.1f} saniye ({wait_time/60:.1f} dakika) - YouTube IP blocking önleme")
            time.sleep(wait_time)
    # Fetch the processed videos to return to the caller.
    processed_videos = db.get_processed_videos(
        limit=max_items,
        channel_id=channel_id
    )
    logger.info(f"[PROCESS] ✅ Channel {channel_id} işleme tamamlandı - {len(processed_videos)} işlenmiş video döndürülüyor")
    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos)
    }
@app.route('/', methods=['GET'])
@require_api_key  # API key required
@validate_input  # Input validation / sanitisation
def generate_feed():
    """
    RSS-Bridge-style URL template endpoint.

    Examples:
        - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
        - /?channel=@tavakfi&format=Atom
        - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Returns an Atom (default) or RSS feed of full-text transcripts, a 400
    JSON error when no channel reference was given, a 404 JSON message when
    no video has been processed yet, or a 500 JSON error on failure.
    """
    # Read query parameters (already sanitised by the validate_input decorator).
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or bare username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # Atom or Rss
    try:
        max_items = int(request.args.get('max_items', 10))  # default: 10 transcripts
        # Cap at 100 transcripts per request (processed in small batches).
        max_items = min(max_items, 100)
    except (ValueError, TypeError):
        max_items = 10
    # Normalise the channel reference into a canonical channel ID.
    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )
    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
            }
        }), 400
    try:
        # Process the channel (fetch videos + extract transcripts).
        result = process_channel(normalized_channel_id, max_items=max_items)
        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Transcript\'ler arka planda işleniyor. Lütfen birkaç dakika sonra tekrar deneyin.',
                'note': 'YouTube IP blocking nedeniyle transcript çıkarımı yavaş olabilir. İlk istekte birkaç dakika bekleyin.'
            }), 404
        # Build the feed from the processed videos.
        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }
        generator = RSSGenerator(channel_info)
        for video in result['videos']:
            generator.add_video_entry(video)
        # Return in the requested format.
        response_headers = {}
        if hasattr(g, 'rate_limit_remaining'):
            response_headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)
        if format_type == 'rss':
            rss_content = generator.generate_rss_string()
            response_headers['Content-Type'] = 'application/rss+xml; charset=utf-8'
            return Response(
                rss_content,
                mimetype='application/rss+xml',
                headers=response_headers
            )
        else:  # Atom
            # Atom support via feedgen.
            atom_content = generator.generate_atom_string()
            response_headers['Content-Type'] = 'application/atom+xml; charset=utf-8'
            return Response(
                atom_content,
                mimetype='application/atom+xml',
                headers=response_headers
            )
    except Exception as e:
        # NOTE(review): str(e) is echoed back to the client and may leak
        # internal details — consider a generic message plus server-side log.
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500
@app.route('/health', methods=['GET'])
@rate_limit(limit_per_minute=120)  # higher limit for health checks
def health():
    """Liveness probe — returns a static OK payload."""
    payload = {'status': 'ok', 'service': 'YouTube Transcript RSS Feed'}
    return jsonify(payload)
@app.route('/info', methods=['GET'])
@require_api_key  # API key required
def info():
    """Return static API metadata: endpoints, usage and example URLs."""
    api_info = {
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': {
            '/': 'RSS Feed Generator',
            '/health': 'Health Check',
            '/info': 'API Info'
        },
        'usage': {
            'channel_id': 'UC... (YouTube Channel ID)',
            'channel': '@username veya username',
            'channel_url': 'Full YouTube channel URL',
            'format': 'Atom veya Rss (varsayılan: Atom)',
            'max_items': 'Her istekte işlenecek maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
        },
        'examples': [
            '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
            '/?channel=@tavakfi&format=Rss',
            '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=50'
        ]
    }
    return jsonify(api_info)
if __name__ == '__main__':
    # NOTE(review): debug=True enables Werkzeug's interactive debugger and
    # auto-reload — never use this entry point in production; serve via a
    # WSGI server (gunicorn/uwsgi) instead.
    app.run(host='0.0.0.0', port=5000, debug=True)