"""
Flask web server - RSS-Bridge-style URL template system.

Serves Atom/RSS feeds containing full-text YouTube transcripts for a channel.
"""
from flask import Flask, request, Response, jsonify, g
from typing import Optional
import sys
import os
import yaml
from pathlib import Path

# Put the project root on sys.path so the `src.*` imports below resolve.
# This must stay ahead of those imports.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator
from src.security import (
    init_security,
    get_security_manager,
    require_api_key,
    rate_limit,
    validate_input
)

app = Flask(__name__)

# Tunables that used to be inline literals.
DEFAULT_MAX_ITEMS = 50
RSS_BRIDGE_BASE_URL = "https://rss-bridge.org/bridge01"
# Upper bound on transcript fetches per request, to avoid YouTube IP blocking.
MAX_TRANSCRIPTS_PER_REQUEST = 5
# A transcript processed within this many days is not fetched again.
TRANSCRIPT_CACHE_DAYS = 3

# Cached security configuration (populated on first use).
_security_config = None


def load_security_config() -> dict:
    """Load and cache the ``security`` section of ``config/security.yaml``.

    Returns:
        The ``security`` mapping from the YAML file, or an empty dict when the
        file is missing, empty, or does not contain a mapping under that key.
    """
    global _security_config
    if _security_config is None:
        config_path = Path(__file__).parent.parent / 'config' / 'security.yaml'
        section = None
        if config_path.exists():
            with open(config_path, 'r', encoding='utf-8') as f:
                # safe_load returns None for an empty document.
                loaded = yaml.safe_load(f)
            if isinstance(loaded, dict):
                section = loaded.get('security')
        # Always cache a dict so callers can use .get() unconditionally and
        # the file is not re-read on every request.
        _security_config = section if isinstance(section, dict) else {}
    return _security_config


def init_app_security():
    """Initialise the security manager from the loaded configuration."""
    config = load_security_config()
    api_keys = config.get('api_keys', {})
    default_rate_limit = config.get('default_rate_limit', 60)
    init_security(api_keys, default_rate_limit)


@app.after_request
def add_security_headers(response):
    """Attach configured security headers, CORS headers and rate-limit info.

    Args:
        response: The outgoing Flask response.

    Returns:
        The same response object with the extra headers set.
    """
    config = load_security_config()

    # `or {}` guards against a key that is present but null in the YAML.
    for header, value in (config.get('security_headers') or {}).items():
        response.headers[header] = value

    cors_config = config.get('cors') or {}
    if cors_config.get('enabled', True):
        origins = cors_config.get('allowed_origins', ['*'])
        if '*' in origins:
            response.headers['Access-Control-Allow-Origin'] = '*'
        else:
            # The response now depends on the request's Origin header, so
            # shared caches must key on it.
            response.headers.add('Vary', 'Origin')
            origin = request.headers.get('Origin')
            if origin in origins:
                response.headers['Access-Control-Allow-Origin'] = origin

        response.headers['Access-Control-Allow-Methods'] = ', '.join(
            cors_config.get('allowed_methods', ['GET', 'OPTIONS'])
        )
        response.headers['Access-Control-Allow-Headers'] = ', '.join(
            cors_config.get('allowed_headers', ['Content-Type', 'X-API-Key'])
        )

    # Expose the remaining rate-limit budget when it was recorded on `g`.
    if hasattr(g, 'rate_limit_remaining'):
        response.headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)

    return response


@app.route('/', methods=['OPTIONS'])
@app.route('/<path:path>', methods=['OPTIONS'])
def handle_options(path=None):
    """Answer CORS preflight requests for the root and any sub-path.

    Args:
        path: Captured sub-path from the catch-all rule; unused.

    Returns:
        An empty 200 response (CORS headers are added by the after-request
        hook).
    """
    return Response(status=200)


# Initialise security once at import time.
init_app_security()

# Lazily created module-level singletons.
db = None
extractor = None
cleaner = None


def get_db():
    """Return the shared Database instance, creating and initialising it on first use."""
    global db
    if db is None:
        db = Database()
        db.init_database()
    return db


def get_extractor():
    """Return the shared TranscriptExtractor instance, creating it on first use."""
    global extractor
    if extractor is None:
        extractor = TranscriptExtractor()
    return extractor


def get_cleaner():
    """Return the shared TranscriptCleaner instance, creating it on first use."""
    global cleaner
    if cleaner is None:
        cleaner = TranscriptCleaner()
    return cleaner


def _channel_id_from_url(url: str) -> Optional[str]:
    """Return the path segment after the last '/channel/' in *url*, or None."""
    _, separator, tail = url.rpartition('/channel/')
    if not separator:
        return None
    # Drop any query string and trailing path components.
    return tail.split('?')[0].split('/')[0] or None


def normalize_channel_id(channel_id: Optional[str] = None,
                         channel: Optional[str] = None,
                         channel_url: Optional[str] = None) -> Optional[str]:
    """Resolve a channel ID from any of the supported input formats.

    The inputs are tried in order: ``channel_id``, ``channel``, ``channel_url``.

    Args:
        channel_id: A channel ID (``UC`` prefix, 24 characters) or a URL
            containing ``youtube.com/channel/``.
        channel: A channel handle, with or without the leading ``@``.
        channel_url: A full YouTube channel URL (``/@handle`` or ``/channel/ID``).

    Returns:
        The resolved channel ID, or None when nothing could be resolved.
    """
    if channel_id:
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        if 'youtube.com/channel/' in channel_id:
            extracted = _channel_id_from_url(channel_id)
            if extracted:
                return extracted

    if channel:
        if not channel.startswith('@'):
            channel = f"@{channel}"
        handle_url = f"https://www.youtube.com/{channel}"
        return get_channel_id_from_handle(handle_url)

    if channel_url:
        if '/@' in channel_url:
            return get_channel_id_from_handle(channel_url)
        if '/channel/' in channel_url:
            return _channel_id_from_url(channel_url)

    return None


def process_channel(channel_id: str, max_items: int = DEFAULT_MAX_ITEMS) -> dict:
    """Sync a channel's videos, extract a few pending transcripts, and list results.

    Args:
        channel_id: YouTube channel ID to process.
        max_items: Maximum number of videos to fetch from RSS-Bridge and to
            return from the database.

    Returns:
        A dict with keys ``videos`` (processed video records), ``channel_id``
        and ``count``.

    Raises:
        Exception: If fetching the video list from RSS-Bridge fails; the
            original error is chained as the cause.
    """
    database = get_db()
    transcript_extractor = get_extractor()
    transcript_cleaner = get_cleaner()

    try:
        videos = fetch_videos_from_rss_bridge(
            base_url=RSS_BRIDGE_BASE_URL,
            channel_id=channel_id,
            format="Atom",
            max_items=max_items
        )
    except Exception as e:
        raise Exception(f"RSS-Bridge hatası: {e}") from e

    # Register videos that are not yet known to the database.
    for video in videos:
        video['channel_id'] = channel_id
        if not database.is_video_processed(video['video_id']):
            database.add_video(video)

    # Filter by channel BEFORE applying the per-request cap; otherwise a
    # backlog belonging to other channels could starve this one entirely.
    attempts = 0
    for video in database.get_pending_videos():
        if attempts >= MAX_TRANSCRIPTS_PER_REQUEST:
            break
        if video['channel_id'] != channel_id:
            continue

        video_id = video['video_id']
        if database.is_transcript_cached(video_id, cache_days=TRANSCRIPT_CACHE_DAYS):
            app.logger.info(
                "Video %s transcript'i cache'de (%d gün içinde işlenmiş), atlanıyor",
                video_id, TRANSCRIPT_CACHE_DAYS
            )
            continue

        # Only real fetch attempts count against the cap.
        attempts += 1
        try:
            transcript = transcript_extractor.fetch_transcript(
                video_id,
                languages=['tr', 'en']
            )
            if transcript:
                raw, clean = transcript_cleaner.clean_transcript(
                    transcript, sentences_per_paragraph=3
                )
                # NOTE(review): the language is stored as 'tr' even though
                # 'en' is also requested above - confirm whether the actual
                # transcript language is available from the extractor.
                database.update_video_transcript(
                    video_id,
                    raw,
                    clean,
                    status=1,
                    language='tr'
                )
        except Exception as e:
            # Best effort: record the failure and carry on with the next video.
            app.logger.warning("Transcript çıkarım hatası %s: %s", video_id, e)
            database.mark_video_failed(video_id, str(e))

    processed_videos = database.get_processed_videos(
        limit=max_items,
        channel_id=channel_id
    )

    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos)
    }


@app.route('/', methods=['GET'])
@require_api_key  # API key required
@validate_input  # Input validation
def generate_feed():
    """Generate a transcript feed using an RSS-Bridge-style URL template.

    Examples:
    - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
    - /?channel=@tavakfi&format=Atom
    - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Returns:
        An RSS or Atom response on success; a JSON error with status 400 when
        no channel could be resolved, 404 when no processed videos exist yet,
        or 500 when processing fails.
    """
    # Presumably already sanitised by the validate_input decorator.
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # atom or rss

    try:
        max_items = int(request.args.get('max_items', DEFAULT_MAX_ITEMS))
    except (ValueError, TypeError):
        max_items = DEFAULT_MAX_ITEMS
    # A zero or negative limit is meaningless; fall back to the default.
    if max_items < 1:
        max_items = DEFAULT_MAX_ITEMS

    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )

    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum video sayısı (varsayılan: 50)'
            }
        }), 400

    try:
        result = process_channel(normalized_channel_id, max_items=max_items)

        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Transcript\'ler arka planda işleniyor. Lütfen birkaç dakika sonra tekrar deneyin.',
                'note': 'YouTube IP blocking nedeniyle transcript çıkarımı yavaş olabilir. İlk istekte birkaç dakika bekleyin.'
            }), 404

        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }

        generator = RSSGenerator(channel_info)
        for video in result['videos']:
            generator.add_video_entry(video)

        # Anything other than 'rss' is served as Atom.
        if format_type == 'rss':
            feed_body = generator.generate_rss_string()
            content_type = 'application/rss+xml; charset=utf-8'
        else:
            feed_body = generator.generate_atom_string()
            content_type = 'application/atom+xml; charset=utf-8'

        # X-RateLimit-Remaining is added to every response by the
        # after-request hook, so it is not set again here.
        return Response(feed_body, content_type=content_type)

    except Exception as e:
        # Top-level boundary: keep the traceback in the log, return JSON.
        app.logger.exception("Feed generation failed for %s", normalized_channel_id)
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500


@app.route('/health', methods=['GET'])
@rate_limit(limit_per_minute=120)  # Higher limit for health checks
def health():
    """Health check endpoint."""
    return jsonify({'status': 'ok', 'service': 'YouTube Transcript RSS Feed'})


@app.route('/info', methods=['GET'])
@require_api_key  # API key required
def info():
    """Return static API information: endpoints, parameters and examples."""
    return jsonify({
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': {
            '/': 'RSS Feed Generator',
            '/health': 'Health Check',
            '/info': 'API Info'
        },
        'usage': {
            'channel_id': 'UC... (YouTube Channel ID)',
            'channel': '@username veya username',
            'channel_url': 'Full YouTube channel URL',
            'format': 'Atom veya Rss (varsayılan: Atom)',
            'max_items': 'Maksimum video sayısı (varsayılan: 50)'
        },
        'examples': [
            '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
            '/?channel=@tavakfi&format=Rss',
            '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=100'
        ]
    })


if __name__ == '__main__':
    # The interactive debugger allows arbitrary code execution, so it must
    # never be on by default while listening on all interfaces. Opt in
    # explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)