"""
Flask web server - RSS-Bridge-like URL template system.

Exposes three endpoints:

* ``/``       - builds an Atom/RSS feed of transcripts for a YouTube channel
* ``/health`` - health check
* ``/info``   - API description
"""
import logging
import os
import sys
import time
from pathlib import Path
from typing import Optional

import yaml
from flask import Flask, request, Response, jsonify, g

# Module-level logger
logger = logging.getLogger(__name__)

# Make the project root importable so the ``src`` package resolves when this
# file is executed directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator
from src.security import (
    init_security, get_security_manager, require_api_key, rate_limit, validate_input
)

app = Flask(__name__)

# Cached security configuration (loaded lazily by load_security_config)
_security_config = None


def load_security_config():
    """Load and cache the ``security`` section of ``config/security.yaml``.

    Returns:
        The ``security`` mapping from the YAML file, or an empty dict when the
        file does not exist, is empty, or has no (or a null) ``security`` key.
    """
    global _security_config
    if _security_config is None:
        config_path = Path(__file__).parent.parent / 'config' / 'security.yaml'
        if config_path.exists():
            with open(config_path, 'r', encoding='utf-8') as f:
                # safe_load returns None for an empty document, so fall back
                # to an empty mapping instead of calling .get on None.
                loaded = yaml.safe_load(f) or {}
            _security_config = loaded.get('security') or {}
        else:
            _security_config = {}
    return _security_config


def init_app_security():
    """Initialise the security manager from the loaded configuration.

    Reads ``api_keys`` (default ``{}``) and ``default_rate_limit``
    (default ``60``) from the security config and passes them to
    ``init_security``.
    """
    config = load_security_config()
    api_keys = config.get('api_keys', {})
    default_rate_limit = config.get('default_rate_limit', 60)
    init_security(api_keys, default_rate_limit)


# Security headers and CORS middleware
@app.after_request
def add_security_headers(response):
    """Attach configured security headers, CORS headers and rate-limit info.

    Args:
        response: The outgoing Flask response.

    Returns:
        The same response object with the extra headers set.
    """
    config = load_security_config()

    # ``or {}`` guards against sections that are present but null in the YAML.
    headers = config.get('security_headers') or {}
    for header, value in headers.items():
        response.headers[header] = value

    # CORS headers (enabled unless the config explicitly disables them)
    cors_config = config.get('cors') or {}
    if cors_config.get('enabled', True):
        origins = cors_config.get('allowed_origins', ['*'])
        if '*' in origins:
            response.headers['Access-Control-Allow-Origin'] = '*'
        else:
            # Echo the request origin back only when it is on the allow-list.
            origin = request.headers.get('Origin')
            if origin in origins:
                response.headers['Access-Control-Allow-Origin'] = origin

        response.headers['Access-Control-Allow-Methods'] = ', '.join(
            cors_config.get('allowed_methods', ['GET', 'OPTIONS'])
        )
        response.headers['Access-Control-Allow-Headers'] = ', '.join(
            cors_config.get('allowed_headers', ['Content-Type', 'X-API-Key'])
        )

    # Expose the remaining rate-limit budget when it was recorded on ``g``.
    if hasattr(g, 'rate_limit_remaining'):
        response.headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)

    return response


# OPTIONS handler for CORS preflight requests.
# The second rule is a catch-all: the handler accepts a ``path`` argument, so
# preflight requests to any sub-path are answered as well as the root.
@app.route('/', methods=['OPTIONS'])
@app.route('/<path:path>', methods=['OPTIONS'])
def handle_options(path=None):
    """Answer a CORS preflight request with an empty 200 response.

    Args:
        path: Requested sub-path (unused; present so the catch-all rule binds).
    """
    return Response(status=200)


# Initialise security at application start-up (import time)
init_app_security()

# Global instances (created lazily on first use)
db = None
extractor = None
cleaner = None


def get_db():
    """Return the shared ``Database`` instance, creating it on first call.

    On creation ``init_database()`` is invoked once.
    """
    global db
    if db is None:
        db = Database()
        db.init_database()
    return db


def get_extractor():
    """Return the shared ``TranscriptExtractor`` instance, creating it on first call."""
    global extractor
    if extractor is None:
        extractor = TranscriptExtractor()
    return extractor


def get_cleaner():
    """Return the shared ``TranscriptCleaner`` instance, creating it on first call."""
    global cleaner
    if cleaner is None:
        cleaner = TranscriptCleaner()
    return cleaner


def normalize_channel_id(channel_id: Optional[str] = None,
                         channel: Optional[str] = None,
                         channel_url: Optional[str] = None) -> Optional[str]:
    """Normalise the different ways of naming a channel into a channel ID.

    The arguments are tried in order: ``channel_id``, then ``channel``, then
    ``channel_url``. The first one that yields a result wins.

    Args:
        channel_id: Direct channel ID (``UC...``, 24 characters) or a
            ``youtube.com/channel/<id>`` URL.
        channel: Channel handle, with or without the leading ``@``.
        channel_url: Full YouTube channel URL (``/@handle`` or ``/channel/<id>``).

    Returns:
        The channel ID, or ``None`` when nothing could be derived. Handles are
        resolved through ``get_channel_id_from_handle``, whose result is
        returned as-is.
    """
    # Direct channel ID
    if channel_id:
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        # A channel URL was passed in the channel_id parameter: take the path
        # segment after /channel/ and drop any query string or trailing path.
        if 'youtube.com/channel/' in channel_id:
            parts = channel_id.split('/channel/')
            if len(parts) > 1:
                return parts[-1].split('?')[0].split('/')[0]

    # Channel handle (@username)
    if channel:
        if not channel.startswith('@'):
            channel = f"@{channel}"
        handle_url = f"https://www.youtube.com/{channel}"
        return get_channel_id_from_handle(handle_url)

    # Channel URL
    if channel_url:
        # Handle URL
        if '/@' in channel_url:
            return get_channel_id_from_handle(channel_url)
        # Channel ID URL
        elif '/channel/' in channel_url:
            parts = channel_url.split('/channel/')
            if len(parts) > 1:
                return parts[-1].split('?')[0].split('/')[0]

    return None


def process_channel(channel_id: str, max_items: int = 50) -> dict:
    """Fetch, transcribe and store videos for a channel, then list the results.

    Steps:
        1. Fetch the channel's video list from RSS-Bridge.
        2. Insert videos that are not yet in the database.
        3. Process pending videos of this channel in batches of 20 until
           ``max_items`` transcripts were stored or no pending videos remain.
        4. Return the processed videos of this channel from the database.

    Args:
        channel_id: YouTube channel ID.
        max_items: Maximum number of transcripts to process in this call and
            maximum number of processed videos to return.

    Returns:
        A dict with ``videos`` (processed video records), ``channel_id`` and
        ``count`` (number of returned videos).

    Raises:
        Exception: When fetching the video list from RSS-Bridge fails; the
            original error is chained as ``__cause__``.
    """
    db = get_db()
    extractor = get_extractor()
    cleaner = get_cleaner()

    # Ask RSS-Bridge for twice max_items (at least 50) because some videos may
    # turn out to have no transcript.
    rss_bridge_limit = max(max_items * 2, 50)

    logger.info(f"[PROCESS] Channel {channel_id} için RSS-Bridge'den video listesi çekiliyor (limit: {rss_bridge_limit})")

    try:
        videos = fetch_videos_from_rss_bridge(
            base_url="https://rss-bridge.org/bridge01",
            channel_id=channel_id,
            format="Atom",
            max_items=rss_bridge_limit
        )
        logger.info(f"[PROCESS] RSS-Bridge'den {len(videos)} video alındı")
    except Exception as e:
        logger.error(f"[PROCESS] ❌ RSS-Bridge hatası: {type(e).__name__} - {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"RSS-Bridge hatası: {e}") from e

    # Add new videos to the database
    new_videos_count = 0
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)
            new_videos_count += 1

    if new_videos_count > 0:
        logger.info(f"[PROCESS] {new_videos_count} yeni video veritabanına eklendi")
    else:
        logger.debug(f"[PROCESS] Tüm videolar zaten veritabanında")

    # Process pending videos, up to max_items, in batches of 20. The batching
    # (with a pause between batches) is meant to avoid YouTube IP blocking.
    batch_size = 20      # videos handled per batch
    processed_count = 0  # transcripts successfully stored in this call

    # All pending videos, filtered down to this channel
    all_pending_videos = [v for v in db.get_pending_videos() if v['channel_id'] == channel_id]
    logger.info(f"[PROCESS] Channel {channel_id} için {len(all_pending_videos)} bekleyen video bulundu (max_items: {max_items})")

    # Ceiling division: number of batches needed for all pending videos
    total_batches = (len(all_pending_videos) + batch_size - 1) // batch_size
    current_batch = 0

    for batch_start in range(0, len(all_pending_videos), batch_size):
        if processed_count >= max_items:
            logger.info(f"[PROCESS] Maksimum transcript sayısına ulaşıldı ({processed_count}/{max_items})")
            break

        current_batch += 1
        batch_videos = all_pending_videos[batch_start:batch_start + batch_size]
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} başlatılıyor ({len(batch_videos)} video, Toplam işlenen: {processed_count}/{max_items})")

        batch_processed = 0
        batch_cached = 0
        batch_failed = 0

        for video in batch_videos:
            if processed_count >= max_items:
                break

            video_id = video['video_id']
            # ``or`` also covers a title that is stored as None, which
            # .get()'s default alone would not.
            video_title = (video.get('video_title') or 'N/A')[:50]

            # Cache check (cache_days=3): skip videos whose transcript is
            # already cached.
            if db.is_transcript_cached(video_id, cache_days=3):
                logger.debug(f"[CACHE] Video {video_id} ({video_title}) transcript'i cache'de, atlanıyor")
                batch_cached += 1
                continue

            try:
                logger.info(f"[VIDEO] Video işleniyor: {video_id} - {video_title}")

                # Extract the transcript
                transcript = extractor.fetch_transcript(
                    video_id,
                    languages=['tr', 'en']
                )

                if transcript:
                    # Clean the transcript
                    logger.debug(f"[VIDEO] Video {video_id} transcript'i temizleniyor...")
                    raw, clean = cleaner.clean_transcript(transcript, sentences_per_paragraph=3)

                    # Persist immediately so finished work survives a later failure.
                    # NOTE(review): the language is always recorded as 'tr'
                    # although 'en' is requested too - confirm whether the
                    # extractor exposes the language actually returned.
                    db.update_video_transcript(
                        video_id,
                        raw,
                        clean,
                        status=1,
                        language='tr'
                    )
                    processed_count += 1
                    batch_processed += 1
                    logger.info(f"[VIDEO] ✅ Video {video_id} başarıyla işlendi ve kaydedildi ({processed_count}/{max_items})")
                else:
                    logger.warning(f"[VIDEO] ⚠️ Video {video_id} transcript'i alınamadı (None döndü)")
                    batch_failed += 1
                    db.mark_video_failed(video_id, "Transcript None döndü")

            except Exception as e:
                # Best effort: one failing video must not abort the whole run.
                error_type = type(e).__name__
                error_msg = str(e)[:200]
                logger.error(f"[VIDEO] ❌ Video {video_id} işleme hatası: {error_type} - {error_msg}")
                db.mark_video_failed(video_id, str(e))
                batch_failed += 1

        # Batch summary
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} tamamlandı - İşlenen: {batch_processed}, Cache: {batch_cached}, Başarısız: {batch_failed}")

        # Short pause between batches (rate limiting), only when another
        # batch will actually follow.
        if processed_count < max_items and batch_start + batch_size < len(all_pending_videos):
            wait_time = 2
            logger.debug(f"[BATCH] Batch'ler arası bekleme: {wait_time} saniye")
            time.sleep(wait_time)

    # Fetch the processed videos
    processed_videos = db.get_processed_videos(
        limit=max_items,
        channel_id=channel_id
    )

    logger.info(f"[PROCESS] ✅ Channel {channel_id} işleme tamamlandı - {len(processed_videos)} işlenmiş video döndürülüyor")

    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos)
    }


@app.route('/', methods=['GET'])
@require_api_key  # API key required
@validate_input  # Input validation
def generate_feed():
    """Build a transcript feed for a channel (RSS-Bridge-like URL template).

    Query parameters:
        channel_id: YouTube channel ID (``UC...``).
        channel: Channel handle, with or without ``@``.
        channel_url: Full YouTube channel URL.
        format: ``Atom`` (default) or ``Rss``, case-insensitive.
        max_items: Number of transcripts, default 10, clamped to 1..100.

    Examples:
        - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
        - /?channel=@tavakfi&format=Atom
        - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Returns:
        The feed as an XML response; a JSON error with status 400 when no
        channel could be determined, 404 when no processed video exists yet,
        or 500 when processing raised.
    """
    # Read query parameters (the validate_input decorator already sanitised them)
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # atom or rss
    try:
        max_items = int(request.args.get('max_items', 10))  # default: 10 transcripts
        # Clamp to 1..100: zero or negative values would otherwise be passed
        # on as the database query limit.
        max_items = max(1, min(max_items, 100))
    except (ValueError, TypeError):
        max_items = 10

    # Normalise the channel ID
    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )

    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
            }
        }), 400

    try:
        # Process the channel
        result = process_channel(normalized_channel_id, max_items=max_items)

        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Transcript\'ler arka planda işleniyor. Lütfen birkaç dakika sonra tekrar deneyin.',
                'note': 'YouTube IP blocking nedeniyle transcript çıkarımı yavaş olabilir. İlk istekte birkaç dakika bekleyin.'
            }), 404

        # Build the feed
        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }

        generator = RSSGenerator(channel_info)

        for video in result['videos']:
            generator.add_video_entry(video)

        response_headers = {}
        if hasattr(g, 'rate_limit_remaining'):
            response_headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)

        # Anything other than "rss" is served as Atom.
        if format_type == 'rss':
            feed_content = generator.generate_rss_string()
            feed_mimetype = 'application/rss+xml'
        else:
            feed_content = generator.generate_atom_string()
            feed_mimetype = 'application/atom+xml'

        response_headers['Content-Type'] = f'{feed_mimetype}; charset=utf-8'
        return Response(
            feed_content,
            mimetype=feed_mimetype,
            headers=response_headers
        )

    except Exception as e:
        # Top-level boundary: record the traceback before turning the error
        # into a JSON 500 response.
        logger.exception(f"[FEED] Feed generation failed for channel {normalized_channel_id}")
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500


@app.route('/health', methods=['GET'])
@rate_limit(limit_per_minute=120)  # Higher limit for health checks
def health():
    """Health check endpoint; returns a static JSON status."""
    return jsonify({'status': 'ok', 'service': 'YouTube Transcript RSS Feed'})


@app.route('/info', methods=['GET'])
@require_api_key  # API key required
def info():
    """Return a static JSON description of the API (endpoints, usage, examples)."""
    return jsonify({
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': {
            '/': 'RSS Feed Generator',
            '/health': 'Health Check',
            '/info': 'API Info'
        },
        'usage': {
            'channel_id': 'UC... (YouTube Channel ID)',
            'channel': '@username veya username',
            'channel_url': 'Full YouTube channel URL',
            'format': 'Atom veya Rss (varsayılan: Atom)',
            'max_items': 'Her istekte işlenecek maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
        },
        'examples': [
            '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
            '/?channel=@tavakfi&format=Rss',
            '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=50'
        ]
    })


if __name__ == '__main__':
    # The interactive debugger permits arbitrary code execution, and this
    # server listens on all interfaces, so debug mode is opt-in: set
    # FLASK_DEBUG=1 (or true/yes) for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)