diff --git a/requirements.txt b/requirements.txt index b978eb0..8ad29cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ spacy>=3.7.0 pyyaml>=6.0.1 flask>=3.0.0 pytz>=2023.3 +yt-dlp>=2024.8.1 diff --git a/src/transcript_extractor.py b/src/transcript_extractor.py index 96cc4b6..2d9da85 100644 --- a/src/transcript_extractor.py +++ b/src/transcript_extractor.py @@ -15,6 +15,22 @@ except ImportError: except ImportError: # pragma: no cover - fallback for unexpected API changes class NoTranscriptAvailable(Exception): # type: ignore """Fallback exception when youtube_transcript_api does not expose NoTranscriptAvailable.""" + +try: + from youtube_transcript_api import YouTubeDataUnparsable # type: ignore +except ImportError: + try: + from youtube_transcript_api._errors import YouTubeDataUnparsable # type: ignore + except ImportError: # pragma: no cover + class YouTubeDataUnparsable(Exception): # type: ignore + """Fallback exception when transcript payload cannot be parsed.""" + +try: + import yt_dlp # type: ignore +except Exception: # pragma: no cover - optional dependency + yt_dlp = None + +import json from typing import List, Dict, Optional import time import logging @@ -247,6 +263,20 @@ class TranscriptExtractor: ) return transcript + except YouTubeDataUnparsable as e: + logger.error( + f"[TRANSCRIPT] ❌ Video {video_id} transcript çıkarımı başarısız (YouTubeDataUnparsable): {e}" + ) + fallback_transcript = self._fetch_transcript_with_yt_dlp(video_id, languages) + if fallback_transcript: + logger.info( + f"[TRANSCRIPT] 🔄 yt-dlp fallback ile transcript çıkarıldı ({len(fallback_transcript)} segment)" + ) + return fallback_transcript + logger.error( + f"[TRANSCRIPT] ❌ yt-dlp fallback da başarısız oldu: Video {video_id}" + ) + return None except (TranscriptsDisabled, NoTranscriptAvailable) as e: logger.error( f"[TRANSCRIPT] ❌ Video {video_id} için transcript devre dışı bırakılmış veya mevcut değil: {type(e).__name__} - {e}" @@ -276,3 +306,151 @@ class 
class TranscriptExtractor:
    # NOTE: only the yt-dlp fallback helpers visible in this section are
    # reproduced here; the remaining members of the class lie outside this
    # excerpt and are not touched by this block.

    def _fetch_transcript_with_yt_dlp(self, video_id: str, languages: List[str]) -> Optional[List[Dict]]:
        """Fetch a transcript through yt-dlp when youtube-transcript-api fails.

        Caption tracks are tried in this order, returning the first one that
        yields a non-empty transcript:

        1. manually created subtitles in one of the preferred languages,
        2. automatic captions in one of the preferred languages,
        3. any remaining manual track, then any remaining automatic track.

        Args:
            video_id: Identifier handed to ``YoutubeDL.extract_info``.
            languages: Preferred language codes in priority order; base codes
                and English variants are appended as additional candidates.

        Returns:
            A list of ``{"text", "start", "duration"}`` dicts, or ``None``
            when yt-dlp is unavailable, the video metadata cannot be fetched,
            or no track produces a transcript.
        """
        if yt_dlp is None:
            logger.error("[YT-DLP] ❌ yt-dlp kütüphanesi yüklü değil, fallback kullanılamıyor")
            return None

        lang_candidates = self._build_language_candidates(languages)

        ydl_opts = {
            "skip_download": True,
            "quiet": True,
            "no_warnings": True,
            "writesubtitles": False,
            "writeautomaticsub": False,
            "subtitleslangs": lang_candidates,
            "subtitlesformat": "json3",
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[operator]
                info = ydl.extract_info(video_id, download=False)
        except Exception as e:
            logger.error(f"[YT-DLP] ❌ Video {video_id} bilgileri alınamadı: {e}")
            return None

        # Guard against a non-dict result so the lookups below cannot raise.
        if not isinstance(info, dict):
            logger.warning(f"[YT-DLP] ⚠️ Video {video_id} için uygun transcript bulunamadı")
            return None

        subtitles = info.get("subtitles") or {}
        auto_subtitles = info.get("automatic_captions") or {}

        for lang in lang_candidates:
            transcript = self._download_caption_entries(subtitles.get(lang))
            if transcript:
                logger.info(f"[YT-DLP] ✅ Manuel altyazı bulundu (lang={lang})")
                return transcript

        for lang in lang_candidates:
            transcript = self._download_caption_entries(auto_subtitles.get(lang))
            if transcript:
                logger.info(f"[YT-DLP] ✅ Otomatik altyazı bulundu (lang={lang})")
                return transcript

        # Last resort: accept a transcript in any other language. Candidate
        # languages were already attempted for both sources above, so they
        # are skipped here to avoid repeating failed downloads.
        already_tried = set(lang_candidates)
        for tracks in (subtitles, auto_subtitles):
            for lang, entries in tracks.items():
                if lang in already_tried:
                    continue
                transcript = self._download_caption_entries(entries)
                if transcript:
                    logger.info("[YT-DLP] ✅ Farklı dilde transcript bulundu")
                    return transcript

        logger.warning(f"[YT-DLP] ⚠️ Video {video_id} için uygun transcript bulunamadı")
        return None

    @staticmethod
    def _build_language_candidates(languages: Optional[List[str]]) -> List[str]:
        """Return the ordered, de-duplicated list of language codes to try.

        Each requested code is followed by its base language (``"tr-TR"`` ->
        ``"tr"``); ``"en"``, ``"en-US"`` and ``"en-GB"`` are appended last as
        generic fallbacks. Empty codes are ignored.
        """
        candidates: List[str] = []

        def add(code) -> None:
            if code and code not in candidates:
                candidates.append(code)

        for lang in languages or []:
            add(lang)
            if lang and "-" in lang:
                add(lang.split("-", 1)[0])

        for fallback_lang in ("en", "en-US", "en-GB"):
            add(fallback_lang)

        return candidates

    @staticmethod
    def _as_json3_url(url: str) -> str:
        """Return ``url`` with its ``fmt`` query parameter set to ``json3``.

        Any existing ``fmt=...`` parameter is replaced rather than duplicated;
        all other parameters are kept byte-for-byte and in order.
        """
        base, has_query, query = url.partition("?")
        if not has_query:
            return f"{url}?fmt=json3"
        params = [p for p in query.split("&") if p and not p.startswith("fmt=")]
        params.append("fmt=json3")
        return f"{base}?{'&'.join(params)}"

    def _download_caption_entries(self, entries) -> Optional[List[Dict]]:
        """Download the caption entries returned by yt-dlp and parse them.

        Args:
            entries: One caption entry (dict) or a list of them, as found
                under a language key of yt-dlp's subtitle mappings. Entries
                without a usable ``url`` are ignored.

        Returns:
            The first non-empty transcript that could be downloaded and
            parsed, or ``None`` if no entry yields one.
        """
        if not entries:
            return None

        if not isinstance(entries, list):
            entries = [entries]

        # Every entry is requested in json3 form (the only format parsed
        # here), so entries that differ only by format collapse to the same
        # URL and are downloaded once instead of once per format.
        targets = []
        seen_urls = set()
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            url = entry.get("url")
            if not url or not isinstance(url, str):
                continue
            json3_url = self._as_json3_url(url)
            if json3_url in seen_urls:
                continue
            seen_urls.add(json3_url)
            targets.append((entry.get("ext"), json3_url))

        if not targets:
            return None

        # Imported lazily and outside the download loop so that a missing
        # dependency is reported once instead of being swallowed per entry.
        try:
            import requests
        except ImportError:
            logger.error("[YT-DLP] ❌ requests kütüphanesi yüklü değil, altyazı indirilemiyor")
            return None

        for ext, url in targets:
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
            except Exception as e:
                logger.debug(f"[YT-DLP] ⚠️ Altyazı indirilemedi ({ext}): {e}")
                continue

            try:
                payload = response.json()
            except ValueError as e:
                logger.debug(f"[YT-DLP] ⚠️ JSON parse edilemedi: {e}")
                continue

            transcript = self._parse_json3_transcript(payload)
            if transcript:
                return transcript

        return None

    @staticmethod
    def _parse_json3_transcript(payload: Dict) -> Optional[List[Dict]]:
        """Convert a YouTube json3 caption payload into the transcript format.

        Args:
            payload: Decoded JSON object expected to carry an ``"events"``
                list; each event may hold ``"segs"`` (a list of dicts with a
                ``"utf8"`` text), ``"tStartMs"`` and ``"dDurationMs"`` (or
                ``"tDurationMs"``).

        Returns:
            A list of ``{"text", "start", "duration"}`` dicts with times in
            seconds, or ``None`` when the payload is malformed or contains no
            text. Events without text are skipped; missing or non-numeric
            timing values become ``0.0`` instead of aborting the parse.
        """
        if not isinstance(payload, dict):
            return None

        events = payload.get("events")
        if not isinstance(events, list):
            return None

        def to_seconds(value) -> float:
            # Tolerate None / garbage timing fields rather than raising.
            try:
                return float(value) / 1000.0
            except (TypeError, ValueError):
                return 0.0

        transcript: List[Dict] = []
        for event in events:
            if not isinstance(event, dict):
                continue
            segments = event.get("segs") or []
            if not isinstance(segments, list):
                continue

            # Only string payloads are joined; a null "utf8" would otherwise
            # make str.join raise TypeError.
            text = "".join(
                seg["utf8"]
                for seg in segments
                if isinstance(seg, dict) and isinstance(seg.get("utf8"), str)
            ).strip()
            if not text:
                continue

            duration_ms = event.get("dDurationMs")
            if duration_ms is None:
                duration_ms = event.get("tDurationMs", 0)

            transcript.append(
                {
                    "text": text,
                    "start": to_seconds(event.get("tStartMs")),
                    "duration": to_seconds(duration_ms),
                }
            )

        return transcript or None