# Youtube2Feed/src/video_fetcher.py

"""
RSS-Bridge kullanarak video metadata çıkarımı
"""
import feedparser
import re
import requests
from urllib.parse import urlencode
from typing import List, Dict, Optional
from datetime import datetime


def get_channel_id_from_handle(handle_url: str, timeout: float = 10.0) -> Optional[str]:
    """
    Resolve a YouTube channel handle URL to its channel ID by scraping the page.

    Example: https://www.youtube.com/@tavakfi -> UC...

    Args:
        handle_url: URL of the channel page (for example a ``/@handle`` URL).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The channel ID ("UC" followed by 22 characters) if one of the known
        JSON keys is present in the page HTML, otherwise None.

    Raises:
        Exception: If the HTTP request fails, times out, or returns an
            error status code.
    """
    try:
        # requests has no default timeout; without one an unresponsive
        # server would block this call indefinitely.
        response = requests.get(handle_url, timeout=timeout)
        response.raise_for_status()
        html_content = response.text
    except requests.exceptions.RequestException as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error fetching channel page: {e}") from e

    # Try the primary key first, then the alternative one.
    id_patterns = (
        r'"externalId":"(UC[a-zA-Z0-9_-]{22})"',
        r'"channelId":"(UC[a-zA-Z0-9_-]{22})"',
    )
    for pattern in id_patterns:
        match = re.search(pattern, html_content)
        if match:
            return match.group(1)

    return None


# Compiled once at import time; each pattern captures the 11-character video ID.
_VIDEO_ID_PATTERNS = (
    # watch URLs: `v` may appear after other query parameters. The lazy
    # repetition makes the first `v=` in the query string win.
    re.compile(r'youtube\.com/watch\?(?:[^#\s&]+&)*?v=([a-zA-Z0-9_-]{11})'),
    re.compile(r'youtu\.be/([a-zA-Z0-9_-]{11})'),
    re.compile(r'youtube\.com/(?:embed|shorts|live)/([a-zA-Z0-9_-]{11})'),
)


def extract_video_id(url: str) -> Optional[str]:
    """
    Extract the video ID from a YouTube URL.

    Recognised forms are ``youtube.com/watch?...v=ID``, ``youtu.be/ID``,
    ``youtube.com/embed/ID``, ``youtube.com/shorts/ID`` and
    ``youtube.com/live/ID``.

    Args:
        url: The URL to inspect. Empty or None input is tolerated.

    Returns:
        The 11-character video ID, or None if no known pattern matches.
    """
    if not url:
        return None

    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)

    return None


def fetch_videos_from_rss_bridge(base_url: str, channel_id: str,
                                 format: str = "Atom", max_items: int = 100) -> List[Dict]:
    """
    Fetch a channel's video list from an RSS-Bridge instance.

    Args:
        base_url: RSS-Bridge base URL (a trailing slash is tolerated).
        channel_id: YouTube channel ID (UC...).
        format: Feed format requested from RSS-Bridge ("Atom" or "Rss").
        max_items: Maximum number of feed entries to inspect.

    Returns:
        A list of dicts with the keys ``video_id``, ``video_title``,
        ``video_url``, ``published_at_utc`` (ISO-8601 string with a "Z"
        suffix, or None when the entry has no parsed date) and
        ``description``. Entries without a link, or whose link contains no
        recognisable video ID, are skipped.

    Raises:
        Exception: If the feed cannot be retrieved or parsed, or if
            processing the entries fails.
    """
    params = {
        'action': 'display',
        'bridge': 'YoutubeBridge',
        'context': 'By channel id',
        'c': channel_id,
        'format': format
    }

    # Strip trailing slashes so "http://host/" does not become "http://host//?...".
    feed_url = f"{base_url.rstrip('/')}/?{urlencode(params)}"

    try:
        feed = feedparser.parse(feed_url)

        # feedparser reports fetch/parse problems through the `bozo` flag
        # instead of raising, so surface them explicitly when nothing usable
        # was parsed rather than returning an empty list.
        if getattr(feed, 'bozo', False) and not feed.entries:
            reason = getattr(feed, 'bozo_exception', 'unknown error')
            raise ValueError(f"feed could not be retrieved or parsed: {reason}")

        videos = []
        for entry in feed.entries[:max_items]:
            # One malformed entry should not abort the whole fetch.
            link = getattr(entry, 'link', None)
            if not link:
                continue

            video_id = extract_video_id(link)
            if not video_id:
                continue

            # Date parsing: published_parsed is a time-struct; its first six
            # fields (year..second) are fed to datetime().
            published_date = None
            published_parsed = getattr(entry, 'published_parsed', None)
            if published_parsed:
                published_date = datetime(*published_parsed[:6]).isoformat() + 'Z'

            videos.append({
                'video_id': video_id,
                'video_title': getattr(entry, 'title', ''),
                'video_url': link,
                'published_at_utc': published_date,
                'description': getattr(entry, 'summary', '')
            })

        return videos

    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error fetching RSS-Bridge feed: {e}") from e