"""
|
||
RSS-Bridge kullanarak video metadata çıkarımı
|
||
"""
|
||
import feedparser
|
||
import re
|
||
import requests
|
||
from urllib.parse import urlencode
|
||
from typing import List, Dict, Optional
|
||
from datetime import datetime
|
||
|
||
|
||
def get_channel_id_from_handle(handle_url: str) -> Optional[str]:
|
||
"""
|
||
Channel handle URL'inden Channel ID'yi web scraping ile bulur.
|
||
Örnek: https://www.youtube.com/@tavakfi -> UC...
|
||
"""
|
||
try:
|
||
response = requests.get(handle_url)
|
||
response.raise_for_status()
|
||
|
||
html_content = response.text
|
||
|
||
# İlk pattern: "externalId":"UC..."
|
||
match = re.search(r'"externalId":"(UC[a-zA-Z0-9_-]{22})"', html_content)
|
||
if match:
|
||
return match.group(1)
|
||
|
||
# Alternatif pattern: "channelId":"UC..."
|
||
match_alt = re.search(r'"channelId":"(UC[a-zA-Z0-9_-]{22})"', html_content)
|
||
if match_alt:
|
||
return match_alt.group(1)
|
||
|
||
return None
|
||
|
||
except requests.exceptions.RequestException as e:
|
||
raise Exception(f"Error fetching channel page: {e}")
|
||
|
||
|
||
def extract_video_id(url: str) -> Optional[str]:
|
||
"""YouTube URL'den video ID çıkar"""
|
||
patterns = [
|
||
r'youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})',
|
||
r'youtu\.be/([a-zA-Z0-9_-]{11})',
|
||
r'youtube\.com/embed/([a-zA-Z0-9_-]{11})'
|
||
]
|
||
|
||
for pattern in patterns:
|
||
match = re.search(pattern, url)
|
||
if match:
|
||
return match.group(1)
|
||
|
||
return None
|
||
|
||
|
||
def fetch_videos_from_rss_bridge(base_url: str, channel_id: str,
|
||
format: str = "Atom", max_items: int = 100) -> List[Dict]:
|
||
"""
|
||
RSS-Bridge'den video listesini çek
|
||
|
||
Args:
|
||
base_url: RSS-Bridge base URL
|
||
channel_id: YouTube Channel ID (UC...)
|
||
format: Feed format (Atom veya Rss)
|
||
max_items: Maksimum video sayısı
|
||
|
||
Returns:
|
||
Video metadata listesi
|
||
"""
|
||
params = {
|
||
'action': 'display',
|
||
'bridge': 'YoutubeBridge',
|
||
'context': 'By channel id',
|
||
'c': channel_id,
|
||
'format': format
|
||
}
|
||
|
||
feed_url = f"{base_url}/?{urlencode(params)}"
|
||
|
||
try:
|
||
feed = feedparser.parse(feed_url)
|
||
|
||
videos = []
|
||
for entry in feed.entries[:max_items]:
|
||
video_id = extract_video_id(entry.link)
|
||
if not video_id:
|
||
continue
|
||
|
||
# Tarih parsing
|
||
published_date = None
|
||
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
||
published_date = datetime(*entry.published_parsed[:6]).isoformat() + 'Z'
|
||
|
||
videos.append({
|
||
'video_id': video_id,
|
||
'video_title': entry.title,
|
||
'video_url': entry.link,
|
||
'published_at_utc': published_date,
|
||
'description': getattr(entry, 'summary', '')
|
||
})
|
||
|
||
return videos
|
||
|
||
except Exception as e:
|
||
raise Exception(f"Error fetching RSS-Bridge feed: {e}")
|
||
|
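# A minimal usage sketch, not part of the original module: it assumes a
# self-hosted RSS-Bridge instance reachable at http://localhost:3000 and reuses
# the @tavakfi handle from the docstring above; adjust both to your own setup.
if __name__ == "__main__":
    RSS_BRIDGE_BASE_URL = "http://localhost:3000"  # assumed RSS-Bridge instance URL
    handle_url = "https://www.youtube.com/@tavakfi"

    channel_id = get_channel_id_from_handle(handle_url)
    if channel_id is None:
        raise SystemExit(f"Could not resolve a channel ID for {handle_url}")

    videos = fetch_videos_from_rss_bridge(RSS_BRIDGE_BASE_URL, channel_id, max_items=5)
    for video in videos:
        print(f"{video['published_at_utc']}  {video['video_title']}  ({video['video_url']})")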