Files
Youtube2Feed/src/web_server.py
2025-11-13 05:31:43 +03:00

452 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Flask web server - RSS-Bridge benzeri URL template sistemi
"""
from flask import Flask, request, Response, jsonify, g
from typing import Optional
import sys
import os
import yaml
import time
import logging
import random
from pathlib import Path
# Logger oluştur
logger = logging.getLogger(__name__)
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator
from src.security import (
init_security, get_security_manager,
require_api_key, rate_limit, validate_input
)
app = Flask(__name__)

# Security config cache — populated lazily by load_security_config().
_security_config = None
def load_security_config():
    """Load and cache the security configuration.

    Reads ``config/security.yaml`` relative to the project root on first
    call and returns the ``security`` section (an empty dict when the file
    is absent). Subsequent calls return the cached dict.
    """
    global _security_config
    if _security_config is not None:
        return _security_config

    cfg_file = Path(__file__).parent.parent / 'config' / 'security.yaml'
    if cfg_file.exists():
        with open(cfg_file, 'r', encoding='utf-8') as fh:
            _security_config = yaml.safe_load(fh).get('security', {})
    else:
        _security_config = {}
    return _security_config
# Initialise the security manager
def init_app_security():
    """Initialise the security manager at application startup.

    Pulls API keys and the default rate limit from the security config and
    hands them to :func:`init_security`.
    """
    cfg = load_security_config()
    init_security(
        cfg.get('api_keys', {}),
        cfg.get('default_rate_limit', 60),
    )
# Security headers and CORS middleware
@app.after_request
def add_security_headers(response):
    """Attach configured security headers, CORS headers and rate-limit info.

    Runs after every request. Header values come from the security config;
    CORS is enabled by default and echoes the request Origin only when it is
    explicitly whitelisted.
    """
    cfg = load_security_config()

    # Plain security headers, copied verbatim from the config file.
    for name, value in cfg.get('security_headers', {}).items():
        response.headers[name] = value

    # CORS headers.
    cors = cfg.get('cors', {})
    if cors.get('enabled', True):
        allowed = cors.get('allowed_origins', ['*'])
        if '*' in allowed:
            response.headers['Access-Control-Allow-Origin'] = '*'
        else:
            req_origin = request.headers.get('Origin')
            if req_origin in allowed:
                response.headers['Access-Control-Allow-Origin'] = req_origin
        response.headers['Access-Control-Allow-Methods'] = ', '.join(
            cors.get('allowed_methods', ['GET', 'OPTIONS'])
        )
        response.headers['Access-Control-Allow-Headers'] = ', '.join(
            cors.get('allowed_headers', ['Content-Type', 'X-API-Key'])
        )

    # Expose the remaining rate-limit quota when the limiter recorded it on g.
    if hasattr(g, 'rate_limit_remaining'):
        response.headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)
    return response
# OPTIONS handler for CORS
@app.route('/', methods=['OPTIONS'])
@app.route('/<path:path>', methods=['OPTIONS'])
def handle_options(path=None):
    """Answer CORS preflight (OPTIONS) requests with an empty 200 response.

    The actual CORS headers are attached by the after_request middleware.
    """
    return Response(status=200)
# Initialise security once at import time (application startup).
init_app_security()

# Global singleton instances — created lazily by the get_* accessors below.
db = None
extractor = None
cleaner = None
def get_db():
    """Return the process-wide Database singleton, creating it on first use."""
    global db
    if db is not None:
        return db
    db = Database()
    db.init_database()
    return db
def get_extractor():
    """Return the shared TranscriptExtractor instance (lazy singleton)."""
    global extractor
    if extractor is not None:
        return extractor
    extractor = TranscriptExtractor()
    return extractor
def get_cleaner():
    """Return the shared TranscriptCleaner instance (lazy singleton)."""
    global cleaner
    if cleaner is not None:
        return cleaner
    cleaner = TranscriptCleaner()
    return cleaner
def normalize_channel_id(channel_id: Optional[str] = None,
                         channel: Optional[str] = None,
                         channel_url: Optional[str] = None) -> Optional[str]:
    """
    Normalise a channel reference given in any of three formats.

    Args:
        channel_id: Raw channel ID (``UC...``), or a ``/channel/`` URL that a
            caller pasted into this field by mistake.
        channel: Channel handle (``@username``) or bare username.
        channel_url: Full YouTube channel URL (handle- or ID-style).

    Returns:
        The canonical channel ID, or None when nothing usable was supplied.
    """
    if channel_id:
        # Already canonical: "UC" prefix and exactly 24 characters.
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        # Tolerate a full /channel/ URL pasted into the channel_id field.
        if 'youtube.com/channel/' in channel_id:
            pieces = channel_id.split('/channel/')
            if len(pieces) > 1:
                return pieces[-1].split('?')[0].split('/')[0]

    if channel:
        # Handle form: ensure the '@' prefix, then resolve via the handle URL.
        handle = channel if channel.startswith('@') else f"@{channel}"
        return get_channel_id_from_handle(f"https://www.youtube.com/{handle}")

    if channel_url:
        if '/@' in channel_url:
            # Handle-style URL — resolve to an ID.
            return get_channel_id_from_handle(channel_url)
        if '/channel/' in channel_url:
            # ID-style URL — extract the ID, dropping query string / trailing path.
            pieces = channel_url.split('/channel/')
            if len(pieces) > 1:
                return pieces[-1].split('?')[0].split('/')[0]

    return None
def process_channel(channel_id: str, max_items: int = 50) -> dict:
    """
    Build the transcript feed data for one channel.

    Fetches the channel's recent videos from RSS-Bridge, stores unseen ones
    in the database, extracts and cleans transcripts in small batches with
    long random pauses between batches (to reduce the risk of YouTube IP
    blocking), and finally returns the already-processed videos.

    Args:
        channel_id: Canonical YouTube channel ID (``UC...``).
        max_items: Maximum number of transcripts to process in this call.

    Returns:
        dict with keys ``videos`` (processed rows from the DB),
        ``channel_id`` and ``count``.

    Raises:
        Exception: When the RSS-Bridge fetch fails.
    """
    db = get_db()
    extractor = get_extractor()
    cleaner = get_cleaner()
    # Fetch the video list from RSS-Bridge. We request roughly twice
    # max_items because some videos will turn out to have no transcript.
    rss_bridge_limit = max(max_items * 2, 50)  # fetch at least 50 videos
    logger.info(f"[PROCESS] Channel {channel_id} için RSS-Bridge'den video listesi çekiliyor (limit: {rss_bridge_limit})")
    try:
        videos = fetch_videos_from_rss_bridge(
            base_url="https://rss-bridge.org/bridge01",
            channel_id=channel_id,
            format="Atom",
            max_items=rss_bridge_limit
        )
        logger.info(f"[PROCESS] RSS-Bridge'den {len(videos)} video alındı")
    except Exception as e:
        logger.error(f"[PROCESS] ❌ RSS-Bridge hatası: {type(e).__name__} - {str(e)}")
        # NOTE(review): re-raising a bare Exception loses the original
        # traceback chain — consider `raise ... from e`.
        raise Exception(f"RSS-Bridge hatası: {e}")
    # Insert videos we have not processed before into the database.
    new_videos_count = 0
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)
            new_videos_count += 1
    if new_videos_count > 0:
        logger.info(f"[PROCESS] {new_videos_count} yeni video veritabanına eklendi")
    else:
        logger.debug(f"[PROCESS] Tüm videolar zaten veritabanında")
    # Process pending videos in small batches (up to max_items transcripts).
    # Small batches reduce the risk of YouTube blocking our IP.
    batch_size = 5  # videos per batch (small batch = lower blocking risk)
    processed_count = 0  # transcripts processed so far in this call
    # All pending videos for this channel only.
    all_pending_videos = [v for v in db.get_pending_videos() if v['channel_id'] == channel_id]
    logger.info(f"[PROCESS] Channel {channel_id} için {len(all_pending_videos)} bekleyen video bulundu (max_items: {max_items})")
    # Ceiling division: number of batches needed to cover all pending videos.
    total_batches = (len(all_pending_videos) + batch_size - 1) // batch_size
    current_batch = 0
    for batch_start in range(0, len(all_pending_videos), batch_size):
        if processed_count >= max_items:
            logger.info(f"[PROCESS] Maksimum transcript sayısına ulaşıldı ({processed_count}/{max_items})")
            break
        current_batch += 1
        batch_videos = all_pending_videos[batch_start:batch_start + batch_size]
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} başlatılıyor ({len(batch_videos)} video, Toplam işlenen: {processed_count}/{max_items})")
        batch_processed = 0
        batch_cached = 0
        batch_failed = 0
        for video in batch_videos:
            if processed_count >= max_items:
                break
            video_id = video['video_id']
            video_title = video.get('video_title', 'N/A')[:50]
            # Cache check: skip videos whose transcript was processed
            # within the last 3 days.
            if db.is_transcript_cached(video_id, cache_days=3):
                logger.debug(f"[CACHE] Video {video_id} ({video_title}) transcript'i cache'de, atlanıyor")
                batch_cached += 1
                continue
            try:
                logger.info(f"[VIDEO] Video işleniyor: {video_id} - {video_title}")
                # Extract the transcript (Turkish preferred, English fallback).
                transcript = extractor.fetch_transcript(
                    video_id,
                    languages=['tr', 'en']
                )
                if transcript:
                    # Clean the transcript into paragraphs.
                    logger.debug(f"[VIDEO] Video {video_id} transcript'i temizleniyor...")
                    raw, clean = cleaner.clean_transcript(transcript, sentences_per_paragraph=3)
                    # Persist immediately so each batch's work survives failures.
                    # NOTE(review): language is hard-coded to 'tr' even when the
                    # English transcript was fetched — confirm intent.
                    db.update_video_transcript(
                        video_id,
                        raw,
                        clean,
                        status=1,
                        language='tr'
                    )
                    processed_count += 1
                    batch_processed += 1
                    logger.info(f"[VIDEO] ✅ Video {video_id} başarıyla işlendi ve kaydedildi ({processed_count}/{max_items})")
                else:
                    logger.warning(f"[VIDEO] ⚠️ Video {video_id} transcript'i alınamadı (None döndü)")
                    batch_failed += 1
                    db.mark_video_failed(video_id, "Transcript None döndü")
            except Exception as e:
                error_type = type(e).__name__
                error_msg = str(e)[:200]
                logger.error(f"[VIDEO] ❌ Video {video_id} işleme hatası: {error_type} - {error_msg}")
                db.mark_video_failed(video_id, str(e))
                batch_failed += 1
        # Batch summary.
        logger.info(f"[BATCH] Batch {current_batch}/{total_batches} tamamlandı - İşlenen: {batch_processed}, Cache: {batch_cached}, Başarısız: {batch_failed}")
        # Long randomised pause between batches (YouTube IP-blocking avoidance),
        # skipped after the final batch or once the quota is reached.
        if processed_count < max_items and batch_start + batch_size < len(all_pending_videos):
            wait_time = 60 + random.uniform(0, 30)  # 60-90 s random (human-like)
            logger.info(f"[BATCH] Batch'ler arası bekleme: {wait_time:.1f} saniye ({wait_time/60:.1f} dakika) - YouTube IP blocking önleme")
            time.sleep(wait_time)
    # Fetch the processed videos to return to the caller.
    processed_videos = db.get_processed_videos(
        limit=max_items,
        channel_id=channel_id
    )
    logger.info(f"[PROCESS] ✅ Channel {channel_id} işleme tamamlandı - {len(processed_videos)} işlenmiş video döndürülüyor")
    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos)
    }
@app.route('/', methods=['GET'])
@require_api_key  # API key required
@validate_input  # Input validation / sanitisation
def generate_feed():
    """
    RSS-Bridge-style URL template endpoint.

    Examples:
        - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
        - /?channel=@tavakfi&format=Atom
        - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Returns an Atom (default) or RSS feed of full-text transcripts, a 400
    JSON error when no channel reference was given, a 404 JSON message when
    no video has been processed yet, or a 500 JSON error on failure.
    """
    # Read query parameters (already sanitised by the validate_input decorator).
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or bare username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # Atom or Rss
    try:
        max_items = int(request.args.get('max_items', 10))  # default: 10 transcripts
        # Cap at 100 transcripts per request (processed in small batches).
        max_items = min(max_items, 100)
    except (ValueError, TypeError):
        max_items = 10
    # Normalise the channel reference into a canonical channel ID.
    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )
    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
            }
        }), 400
    try:
        # Process the channel (fetch videos + extract transcripts).
        result = process_channel(normalized_channel_id, max_items=max_items)
        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Transcript\'ler arka planda işleniyor. Lütfen birkaç dakika sonra tekrar deneyin.',
                'note': 'YouTube IP blocking nedeniyle transcript çıkarımı yavaş olabilir. İlk istekte birkaç dakika bekleyin.'
            }), 404
        # Build the feed from the processed videos.
        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }
        generator = RSSGenerator(channel_info)
        for video in result['videos']:
            generator.add_video_entry(video)
        # Return in the requested format.
        response_headers = {}
        if hasattr(g, 'rate_limit_remaining'):
            response_headers['X-RateLimit-Remaining'] = str(g.rate_limit_remaining)
        if format_type == 'rss':
            rss_content = generator.generate_rss_string()
            response_headers['Content-Type'] = 'application/rss+xml; charset=utf-8'
            return Response(
                rss_content,
                mimetype='application/rss+xml',
                headers=response_headers
            )
        else:  # Atom
            # Atom support via feedgen.
            atom_content = generator.generate_atom_string()
            response_headers['Content-Type'] = 'application/atom+xml; charset=utf-8'
            return Response(
                atom_content,
                mimetype='application/atom+xml',
                headers=response_headers
            )
    except Exception as e:
        # NOTE(review): str(e) is echoed back to the client and may leak
        # internal details — consider a generic message plus server-side log.
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500
@app.route('/health', methods=['GET'])
@rate_limit(limit_per_minute=120)  # higher limit for health checks
def health():
    """Liveness probe — returns a static OK payload."""
    payload = {'status': 'ok', 'service': 'YouTube Transcript RSS Feed'}
    return jsonify(payload)
@app.route('/info', methods=['GET'])
@require_api_key  # API key required
def info():
    """Return static API metadata: endpoints, usage and example URLs."""
    api_info = {
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': {
            '/': 'RSS Feed Generator',
            '/health': 'Health Check',
            '/info': 'API Info'
        },
        'usage': {
            'channel_id': 'UC... (YouTube Channel ID)',
            'channel': '@username veya username',
            'channel_url': 'Full YouTube channel URL',
            'format': 'Atom veya Rss (varsayılan: Atom)',
            'max_items': 'Her istekte işlenecek maksimum transcript sayısı (varsayılan: 10, maksimum: 100, 20\'şer batch\'ler halinde işlenir)'
        },
        'examples': [
            '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
            '/?channel=@tavakfi&format=Rss',
            '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=50'
        ]
    }
    return jsonify(api_info)
if __name__ == '__main__':
    # NOTE(review): debug=True enables Werkzeug's interactive debugger and
    # auto-reload — never use this entry point in production; serve via a
    # WSGI server (gunicorn/uwsgi) instead.
    app.run(host='0.0.0.0', port=5000, debug=True)