diff --git a/config/config.yaml b/config/config.yaml
index 2a6929b..d9a6ce6 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,6 +28,12 @@ transcript:
   enable_sbd: true
   paragraph_length: 3
 
+# FlareSolverr settings (used to avoid YouTube IP blocking)
+flaresolverr:
+  # FlareSolverr API URL (set to null or leave empty to disable)
+  url: "http://192.168.1.27:8191/v1"
+  # Or use the environment variable: FLARESOLVERR_URL=http://192.168.1.27:8191/v1
+
 rss:
   title: "Channel Transcript Feed"
   description: "Full-text transcript RSS feed"
diff --git a/src/transcript_extractor.py b/src/transcript_extractor.py
index b6ab43b..4929d68 100644
--- a/src/transcript_extractor.py
+++ b/src/transcript_extractor.py
@@ -33,11 +33,13 @@ USER_AGENTS = [
 class TranscriptExtractor:
     """YouTube transcript çıkarıcı sınıfı"""
 
-    def __init__(self, rate_limit: int = 1, time_window: int = 60):
+    def __init__(self, rate_limit: int = 1, time_window: int = 60,
+                 flaresolverr_url: Optional[str] = None):
         """
         Args:
             rate_limit: Zaman penceresi başına maksimum istek sayısı (YouTube IP blocking'i önlemek için çok düşük)
             time_window: Zaman penceresi (saniye) - daha uzun süre
+            flaresolverr_url: FlareSolverr API URL (e.g. http://192.168.1.27:8191/v1); when omitted, the FLARESOLVERR_URL environment variable is used
         """
         self.rate_limit = rate_limit  # 1 istek/60 saniye (çok konservatif)
         self.time_window = time_window
@@ -45,36 +47,187 @@ class TranscriptExtractor:
         self.last_blocked_time = 0
         self.block_count = 0  # Toplam blocking sayısı
 
+        # FlareSolverr settings: an explicit argument wins, then the
+        # FLARESOLVERR_URL environment variable. With neither set the feature
+        # stays disabled instead of falling back to a hard-coded address.
+        self.flaresolverr_url = flaresolverr_url or os.getenv('FLARESOLVERR_URL')
+        self.use_flaresolverr = bool(self.flaresolverr_url)
+
+        if self.use_flaresolverr:
+            logger.info(f"[FLARESOLVERR] FlareSolverr etkin: {self.flaresolverr_url}")
+            self._test_flaresolverr()
+        else:
+            logger.info("[FLARESOLVERR] FlareSolverr devre dışı")
+
         # Gerçek tarayıcı header'larını ayarla
         self._setup_browser_headers()
 
+    def _test_flaresolverr(self):
+        """Probe the FlareSolverr endpoint.
+
+        HTTP 405 counts as success. Any other status only logs a warning and
+        leaves FlareSolverr enabled; if the request itself raises,
+        ``use_flaresolverr`` is set to False.
+
+        Returns:
+            True if the endpoint answered with HTTP 405, otherwise False.
+        """
+        try:
+            import requests
+
+            test_response = requests.get(f"{self.flaresolverr_url.replace('/v1', '')}/v1", timeout=5)
+            if test_response.status_code == 405:  # Method Not Allowed is expected (endpoint wants POST, not GET)
+                logger.info("[FLARESOLVERR] ✅ FlareSolverr erişilebilir")
+                return True
+            else:
+                logger.warning(f"[FLARESOLVERR] ⚠️ FlareSolverr yanıtı beklenmedik: {test_response.status_code}")
+                return False
+        except Exception as e:
+            logger.warning(f"[FLARESOLVERR] ⚠️ FlareSolverr test edilemedi: {e}")
+            logger.warning(f"[FLARESOLVERR] FlareSolverr devre dışı bırakılıyor")
+            self.use_flaresolverr = False
+            return False
+
+    def _make_flaresolverr_request(self, url: str, method: str = 'GET', **kwargs) -> Optional[object]:
+        """Fetch ``url`` through FlareSolverr and wrap the result.
+
+        Args:
+            url: Target URL to fetch.
+            method: HTTP method of the intercepted request. Only GET is
+                proxied, because the payload below is always ``request.get``;
+                for any other method None is returned so that the caller
+                sends the request directly.
+            **kwargs: Keyword arguments of the intercepted call; only
+                ``headers`` is read.
+
+        Returns:
+            A minimal requests.Response look-alike, or None when FlareSolverr
+            is disabled, the method is not GET, or the request failed.
+        """
+        if not self.use_flaresolverr:
+            return None
+
+        if method.upper() != 'GET':
+            # Sending a POST as "request.get" would silently drop its body.
+            logger.debug(f"[FLARESOLVERR] {method} isteği FlareSolverr ile gönderilemiyor, doğrudan istek yapılacak")
+            return None
+
+        try:
+            import requests
+
+            # FlareSolverr API request
+            flaresolverr_payload = {
+                "cmd": "request.get",
+                "url": url,
+                "maxTimeout": 60000,  # 60 second timeout
+            }
+
+            # Forward the caller's headers as a plain dict so they serialize to JSON
+            headers = kwargs.get('headers', {})
+            if headers:
+                flaresolverr_payload["headers"] = dict(headers)
+
+            logger.debug(f"[FLARESOLVERR] İstek gönderiliyor: {url[:50]}...")
+
+            response = requests.post(
+                self.flaresolverr_url,
+                json=flaresolverr_payload,
+                timeout=65  # slightly longer than FlareSolverr's own timeout
+            )
+
+            if response.status_code != 200:
+                logger.error(f"[FLARESOLVERR] ❌ FlareSolverr HTTP hatası: {response.status_code}")
+                return None
+
+            result = response.json()
+            if result.get('status') != 'ok':
+                error = result.get('message', 'Unknown error')
+                logger.error(f"[FLARESOLVERR] ❌ FlareSolverr hatası: {error}")
+                return None
+
+            solution = result.get('solution', {})
+            # FlareSolverr response format: solution.status is the HTTP status
+            # code, solution.response is the page content
+            status_code = solution.get('status', 200)
+            html = solution.get('response', '')
+            headers = solution.get('headers', {})
+
+            # Minimal stand-in for requests.Response
+            class FlareSolverrResponse:
+                def __init__(self, status_code, text, headers, url):
+                    self.status_code = status_code
+                    self.text = text
+                    self.content = text.encode('utf-8') if isinstance(text, str) else text
+                    self.headers = headers if headers else {}
+                    self.url = url
+                    self.ok = 200 <= status_code < 300
+
+                def json(self):
+                    import json
+                    try:
+                        return json.loads(self.text)
+                    except (TypeError, ValueError) as exc:
+                        # Best effort: still return {} but leave a trace in the log
+                        logger.warning(f"[FLARESOLVERR] ⚠️ Yanıt JSON olarak ayrıştırılamadı: {exc}")
+                        return {}
+
+                def raise_for_status(self):
+                    # Same contract as requests.Response.raise_for_status
+                    if 400 <= self.status_code < 600:
+                        raise requests.HTTPError(
+                            f"{self.status_code} Error for url: {self.url}",
+                            response=self,
+                        )
+
+            logger.info(f"[FLARESOLVERR] ✅ İstek başarılı: HTTP {status_code}, {len(html)} byte içerik")
+            return FlareSolverrResponse(status_code, html, headers, url)
+
+        except Exception as e:
+            logger.error(f"[FLARESOLVERR] ❌ FlareSolverr istek hatası: {type(e).__name__} - {str(e)[:200]}")
+            return None
+
     def _setup_browser_headers(self):
         """Gerçek tarayıcı gibi HTTP header'larını ayarla"""
         try:
             import requests
 
+            # Keep a reference to this TranscriptExtractor for the closures below
+            extractor_instance = self
+
             # requests.Session'ın get/post metodlarını patch et
             # Bu, youtube-transcript-api'nin yaptığı tüm isteklere header ekler
             original_get = requests.Session.get
             original_post = requests.Session.post
 
-            def patched_get(self, url, **kwargs):
-                """requests.Session.get'i patch et - header'ları ekle"""
+            def patched_get(session_self, url, **kwargs):
+                """Patched requests.Session.get: add browser headers and use FlareSolverr for YouTube."""
+                # Route YouTube URLs through FlareSolverr when it is enabled
+                if extractor_instance.use_flaresolverr and ('youtube.com' in url or 'youtu.be' in url):
+                    logger.info(f"[FLARESOLVERR] YouTube isteği FlareSolverr üzerinden: {url[:50]}...")
+                    flaresolverr_response = extractor_instance._make_flaresolverr_request(url, 'GET', **kwargs)
+                    if flaresolverr_response is not None:
+                        # Already exposes the requests.Response-like attributes; no second wrapper needed
+                        return flaresolverr_response
+                    logger.warning(f"[FLARESOLVERR] FlareSolverr yanıt vermedi, normal istek deneniyor")
+
+                # Normal request (add headers)
                 headers = kwargs.get('headers', {})
                 browser_headers = TranscriptExtractor._get_browser_headers()
-                # Mevcut header'ları koru, browser header'larını ekle
                 merged_headers = {**browser_headers, **headers}
                 kwargs['headers'] = merged_headers
-                return original_get(self, url, **kwargs)
+                return original_get(session_self, url, **kwargs)
 
-            def patched_post(self, url, **kwargs):
-                """requests.Session.post'i patch et - header'ları ekle"""
+            def patched_post(session_self, url, **kwargs):
+                """Patched requests.Session.post: add browser headers.
+
+                POSTs are always sent directly: FlareSolverr is only driven with
+                ``request.get`` here, which cannot carry the request body.
+                """
                 headers = kwargs.get('headers', {})
                 browser_headers = TranscriptExtractor._get_browser_headers()
-                # Mevcut header'ları koru, browser header'larını ekle
                 merged_headers = {**browser_headers, **headers}
                 kwargs['headers'] = merged_headers
-                return original_post(self, url, **kwargs)
+                return original_post(session_self, url, **kwargs)
 
             # Patch'i uygula (sadece bir kez)
             if not hasattr(requests.Session, '_browser_headers_patched'):
@@ -83,6 +236,8 @@ class TranscriptExtractor:
                 requests.Session._browser_headers_patched = True
 
             logger.info("[HEADERS] ✅ requests.Session patch edildi - Gerçek tarayıcı header'ları eklendi")
+            if self.use_flaresolverr:
+                logger.info("[FLARESOLVERR] ✅ FlareSolverr desteği eklendi")
         except Exception as e:
             logger.warning(f"[HEADERS] ⚠️ Header patch edilemedi: {e}")
 
diff --git a/src/web_server.py b/src/web_server.py
index 940ff86..341e031 100644
--- a/src/web_server.py
+++ b/src/web_server.py
@@ -114,7 +114,22 @@ def get_extractor():
     """Transcript extractor instance'ı al"""
     global extractor
     if extractor is None:
-        extractor = TranscriptExtractor()
+        # Start from the environment variable; a URL in config.yaml overrides it
+        flaresolverr_url = os.getenv('FLARESOLVERR_URL')
+
+        # Read the config file (if it exists)
+        try:
+            config_path = Path(__file__).parent.parent / 'config' / 'config.yaml'
+            if config_path.exists():
+                with open(config_path, 'r', encoding='utf-8') as f:
+                    config_data = yaml.safe_load(f)
+                # An empty file or an empty "flaresolverr:" section parses to None
+                flaresolverr_config = (config_data or {}).get('flaresolverr') or {}
+                flaresolverr_url = flaresolverr_config.get('url') or flaresolverr_url
+        except Exception as e:
+            logger.debug(f"Config dosyası okunamadı: {e}")
+
+        extractor = TranscriptExtractor(flaresolverr_url=flaresolverr_url)
     return extractor
 
 