# Youtube2Feed/src/web_server.py

"""
Flask web server - RSS-Bridge benzeri URL template sistemi
"""
from flask import Flask, request, Response, jsonify
from typing import Optional
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator


# Flask application object; the route decorators in this module register on it.
app = Flask(__name__)

# Module-level shared instances. They start as None and are created lazily
# on first use by get_db(), get_extractor() and get_cleaner().
db = None
extractor = None
cleaner = None


def get_db():
    """Return the shared Database, creating and initialising it on first call."""
    global db
    if db is not None:
        return db
    # First call: build the instance and initialise it before handing it out.
    db = Database()
    db.init_database()
    return db


def get_extractor():
    """Return the shared TranscriptExtractor, building it on first call."""
    global extractor
    if extractor is not None:
        return extractor
    extractor = TranscriptExtractor()
    return extractor


def get_cleaner():
    """Return the shared TranscriptCleaner, building it on first call."""
    global cleaner
    if cleaner is not None:
        return cleaner
    cleaner = TranscriptCleaner()
    return cleaner


def _channel_id_from_channel_url(url: str) -> Optional[str]:
    """Return the path segment that follows the last ``/channel/`` in *url*, or None."""
    _, marker, tail = url.rpartition('/channel/')
    if not marker:
        return None
    # Drop the query string, the fragment and any trailing path (e.g. /videos).
    for separator in ('?', '#', '/'):
        tail = tail.split(separator, 1)[0]
    # An empty segment (URL ends right after /channel/) is "not found", not ''.
    return tail or None


def normalize_channel_id(channel_id: Optional[str] = None,
                        channel: Optional[str] = None,
                        channel_url: Optional[str] = None) -> Optional[str]:
    """
    Resolve a channel ID from whichever identifier was supplied.

    The inputs are tried in order: ``channel_id``, then ``channel``, then
    ``channel_url``. Surrounding whitespace is ignored and blank values are
    treated as missing. A ``channel_id`` that is neither a 24-character
    ``UC...`` ID nor a usable ``youtube.com/channel/`` URL falls through to
    the remaining inputs.

    Args:
        channel_id: Direct channel ID (``UC`` + 22 characters), or a
            ``youtube.com/channel/<id>`` URL.
        channel: Channel handle, with or without the leading ``@``.
        channel_url: Full channel URL, either ``.../@handle`` or
            ``.../channel/<id>``.

    Returns:
        The channel ID, or None when none could be determined. For handles
        the result is whatever ``get_channel_id_from_handle`` returns.
    """
    channel_id = (channel_id or '').strip()
    channel = (channel or '').strip()
    channel_url = (channel_url or '').strip()

    # 1) Explicit channel ID (or a /channel/ URL passed in its place).
    if channel_id:
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        if 'youtube.com/channel/' in channel_id:
            extracted = _channel_id_from_channel_url(channel_id)
            if extracted:
                return extracted

    # 2) Handle: make sure it carries the '@' prefix, then resolve it.
    if channel:
        handle = channel if channel.startswith('@') else f"@{channel}"
        return get_channel_id_from_handle(f"https://www.youtube.com/{handle}")

    # 3) Full URL: handle form first, then the /channel/<id> form.
    if channel_url:
        if '/@' in channel_url:
            # NOTE(review): the URL is forwarded unchanged; if
            # get_channel_id_from_handle fetches it, consider restricting the
            # host to youtube.com - confirm against that helper.
            return get_channel_id_from_handle(channel_url)
        if '/channel/' in channel_url:
            return _channel_id_from_channel_url(channel_url)

    return None


def process_channel(channel_id: str, max_items: int = 50) -> dict:
    """
    Sync a channel's videos, extract pending transcripts and list the results.

    Steps:
        1. Fetch the channel's video list through RSS-Bridge.
        2. Store every video the database does not report as processed.
        3. Extract and clean transcripts for up to 20 pending videos that
           belong to this channel.
        4. Read back the processed videos for the channel.

    Args:
        channel_id: YouTube channel ID to process.
        max_items: Limit passed both to the RSS-Bridge fetch and to the
            processed-video query.

    Returns:
        Dict with the keys ``'videos'`` (processed videos as returned by the
        database), ``'channel_id'`` and ``'count'`` (length of ``'videos'``).

    Raises:
        Exception: When the RSS-Bridge fetch fails; the original error is
            chained as ``__cause__``.
    """
    database = get_db()
    transcript_extractor = get_extractor()
    transcript_cleaner = get_cleaner()

    # Fetch the video list from RSS-Bridge.
    try:
        videos = fetch_videos_from_rss_bridge(
            base_url="https://rss-bridge.org/bridge01",
            channel_id=channel_id,
            format="Atom",
            max_items=max_items
        )
    except Exception as e:
        # Keep the original traceback attached for debugging.
        raise Exception(f"RSS-Bridge hatası: {e}") from e

    # Register videos the database has not processed yet.
    for video in videos:
        video['channel_id'] = channel_id
        if not database.is_video_processed(video['video_id']):
            database.add_video(video)

    # Filter by channel BEFORE applying the batch limit; slicing first would
    # let pending videos of other channels use up the 20 slots.
    pending_videos = [
        video for video in database.get_pending_videos()
        if video['channel_id'] == channel_id
    ][:20]

    for video in pending_videos:
        try:
            transcript = transcript_extractor.fetch_transcript(
                video['video_id'],
                languages=['tr', 'en']
            )

            # A falsy transcript results in no database update for this video.
            if transcript:
                raw, clean = transcript_cleaner.clean_transcript(
                    transcript, sentences_per_paragraph=3
                )

                # NOTE(review): language is stored as 'tr' even though 'en' is
                # also requested above - confirm whether the actual transcript
                # language should be recorded instead.
                database.update_video_transcript(
                    video['video_id'],
                    raw,
                    clean,
                    status=1,
                    language='tr'
                )
        except Exception as e:
            # Best effort: one failing video must not abort the whole batch.
            print(f"Transcript çıkarım hatası {video['video_id']}: {e}")
            database.mark_video_failed(video['video_id'], str(e))

    # Collect the processed videos for this channel.
    processed_videos = database.get_processed_videos(
        limit=max_items,
        channel_id=channel_id
    )

    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos)
    }


@app.route('/', methods=['GET'])
def generate_feed():
    """
    Serve a transcript feed for a channel (RSS-Bridge-style URL template).

    Query parameters:
        channel_id: YouTube channel ID (UC...).
        channel: Handle, with or without the leading '@'.
        channel_url: Full channel URL.
        format: 'Rss' for RSS; any other value yields Atom (default: Atom).
        max_items: Positive integer, maximum number of videos (default: 50).

    Examples:
    - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
    - /?channel=@tavakfi&format=Atom
    - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Returns:
        The feed as an XML response, or a JSON error with status 400
        (no channel resolved or invalid max_items), 404 (no processed videos
        yet) or 500 (processing failed).
    """
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # atom or rss

    # Validate max_items up front so malformed input yields 400, not a crash.
    raw_max_items = request.args.get('max_items', 50)
    try:
        max_items = int(raw_max_items)
    except (TypeError, ValueError):
        max_items = None
    if max_items is None or max_items < 1:
        return jsonify({
            'error': 'max_items pozitif bir tam sayı olmalı',
            'max_items': raw_max_items
        }), 400

    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )

    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum video sayısı (varsayılan: 50)'
            }
        }), 400

    try:
        result = process_channel(normalized_channel_id, max_items=max_items)

        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Lütfen birkaç dakika sonra tekrar deneyin'
            }), 404

        # Feed-level metadata handed to the generator.
        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }

        generator = RSSGenerator(channel_info)

        for video in result['videos']:
            generator.add_video_entry(video)

        # Pick the serialisation; anything other than 'rss' falls back to Atom.
        if format_type == 'rss':
            feed_body = generator.generate_rss_string()
            content_type = 'application/rss+xml; charset=utf-8'
        else:
            feed_body = generator.generate_atom_string()
            content_type = 'application/atom+xml; charset=utf-8'

        # A single content_type replaces the former mimetype + duplicate header.
        return Response(feed_body, content_type=content_type)

    except Exception as e:
        # Top-level boundary: record the traceback, then answer with JSON.
        app.logger.exception(
            "Feed generation failed for channel %s", normalized_channel_id
        )
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500


@app.route('/health', methods=['GET'])
def health():
    """Health check: respond with a fixed JSON status payload."""
    payload = {'status': 'ok', 'service': 'YouTube Transcript RSS Feed'}
    return jsonify(payload)


@app.route('/info', methods=['GET'])
def info():
    """Describe the API: service name, version, endpoints, parameters and examples."""
    # Routes exposed by this server.
    endpoints = {
        '/': 'RSS Feed Generator',
        '/health': 'Health Check',
        '/info': 'API Info'
    }
    # Query parameters accepted by the feed endpoint.
    usage = {
        'channel_id': 'UC... (YouTube Channel ID)',
        'channel': '@username veya username',
        'channel_url': 'Full YouTube channel URL',
        'format': 'Atom veya Rss (varsayılan: Atom)',
        'max_items': 'Maksimum video sayısı (varsayılan: 50)'
    }
    # Sample request URLs.
    examples = [
        '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
        '/?channel=@tavakfi&format=Rss',
        '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=100'
    ]
    description = {
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': endpoints,
        'usage': usage,
        'examples': examples
    }
    return jsonify(description)


if __name__ == '__main__':
    import os

    # The server listens on all interfaces, so the interactive debugger
    # (which permits arbitrary code execution) must never be on by default.
    # Enable it explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on'
    )
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)