sync: auto-commit uncommitted changes

2026-02-25 16:22:33 +03:00
parent e52708cda9
commit c22b33f37d
2 changed files with 179 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ spacy>=3.7.0
 pyyaml>=6.0.1
 flask>=3.0.0
 pytz>=2023.3
+yt-dlp>=2024.8.1

--- a/src/transcript_extractor.py
+++ b/src/transcript_extractor.py
@@ -15,6 +15,22 @@ except ImportError:
    except ImportError:  # pragma: no cover - fallback for unexpected API changes
        class NoTranscriptAvailable(Exception):  # type: ignore
            """Fallback exception when youtube_transcript_api does not expose NoTranscriptAvailable."""
+
+try:
+    from youtube_transcript_api import YouTubeDataUnparsable  # type: ignore
+except ImportError:
+    try:
+        from youtube_transcript_api._errors import YouTubeDataUnparsable  # type: ignore
+    except ImportError:  # pragma: no cover
+        class YouTubeDataUnparsable(Exception):  # type: ignore
+            """Fallback exception when transcript payload cannot be parsed."""
+
+try:
+    import yt_dlp  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    yt_dlp = None
+
+import json
 from typing import List, Dict, Optional
 import time
 import logging
@@ -247,6 +263,20 @@ class TranscriptExtractor:
            )

            return transcript
+        except YouTubeDataUnparsable as e:
+            logger.error(
+                f"[TRANSCRIPT] ❌ Video {video_id} transcript çıkarımı başarısız (YouTubeDataUnparsable): {e}"
+            )
+            fallback_transcript = self._fetch_transcript_with_yt_dlp(video_id, languages)
+            if fallback_transcript:
+                logger.info(
+                    f"[TRANSCRIPT] 🔄 yt-dlp fallback ile transcript çıkarıldı ({len(fallback_transcript)} segment)"
+                )
+                return fallback_transcript
+            logger.error(
+                f"[TRANSCRIPT] ❌ yt-dlp fallback da başarısız oldu: Video {video_id}"
+            )
+            return None
        except (TranscriptsDisabled, NoTranscriptAvailable) as e:
            logger.error(
                f"[TRANSCRIPT] ❌ Video {video_id} için transcript devre dışı bırakılmış veya mevcut değil: {type(e).__name__} - {e}"
@@ -276,3 +306,151 @@ class TranscriptExtractor:

            return None

+    def _fetch_transcript_with_yt_dlp(self, video_id: str, languages: List[str]) -> Optional[List[Dict]]:
+        """Fetch a transcript through yt-dlp when youtube-transcript-api fails.
+
+        Candidate languages are ``languages`` in order (each followed by its
+        base code when it contains ``-``), then ``en``, ``en-US``, ``en-GB``.
+        Manual subtitles are tried first, then automatic captions, and finally
+        any available track regardless of language.
+
+        Args:
+            video_id: Value passed to ``YoutubeDL.extract_info``.
+            languages: Preferred language codes in priority order (may be empty).
+
+        Returns:
+            The first transcript ``_download_caption_entries`` yields, or ``None``
+            if yt-dlp is missing, extraction raises, or no track is usable.
+        """
+        if yt_dlp is None:
+            logger.error("[YT-DLP] ❌ yt-dlp kütüphanesi yüklü değil, fallback kullanılamıyor")
+            return None
+
+        wanted: List[str] = []
+        for code in languages or []:
+            if not code:
+                continue
+            variants = [code, code.split("-", 1)[0]] if "-" in code else [code]
+            wanted += [v for v in variants if v and v not in wanted]
+        wanted += [v for v in ("en", "en-US", "en-GB") if v not in wanted]
+
+        options = {
+            "skip_download": True, "quiet": True, "no_warnings": True,
+            "writesubtitles": False, "writeautomaticsub": False,
+            "subtitleslangs": wanted, "subtitlesformat": "json3",
+        }
+        try:
+            with yt_dlp.YoutubeDL(options) as ydl:  # type: ignore[operator]
+                info = ydl.extract_info(video_id, download=False)
+        except Exception as e:
+            logger.error(f"[YT-DLP] ❌ Video {video_id} bilgileri alınamadı: {e}")
+            return None
+        manual = info.get("subtitles") or {}
+        automatic = info.get("automatic_captions") or {}
+
+        for is_auto, tracks in enumerate((manual, automatic)):
+            for lang in wanted:
+                transcript = self._download_caption_entries(tracks.get(lang))
+                if not transcript:
+                    continue
+                if is_auto:
+                    logger.info(f"[YT-DLP] ✅ Otomatik altyazı bulundu (lang={lang})")
+                else:
+                    logger.info(f"[YT-DLP] ✅ Manuel altyazı bulundu (lang={lang})")
+                return transcript
+        # Last resort: accept any language, manual tracks before automatic ones.
+        for entries in list(manual.values()) + list(automatic.values()):
+            transcript = self._download_caption_entries(entries)
+            if transcript:
+                logger.info("[YT-DLP] ✅ Farklı dilde transcript bulundu")
+                return transcript
+        logger.warning(f"[YT-DLP] ⚠️ Video {video_id} için uygun transcript bulunamadı")
+        return None
+
+    def _download_caption_entries(self, entries) -> Optional[List[Dict]]:
+        """Download caption tracks reported by yt-dlp and parse the first usable one.
+
+        Args:
+            entries: A caption-format dict or list of dicts with ``url`` and ``ext`` keys.
+
+        Returns:
+            The transcript from the first track that downloads and parses via
+            ``_parse_json3_transcript``; ``None`` if there is none.
+        """
+        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+        if not entries:
+            return None
+        if not isinstance(entries, list):
+            entries = [entries]
+        tracks = [e for e in entries if isinstance(e, dict) and e.get("url")]
+        # Native json3 tracks first; the stable sort keeps the reported order.
+        for entry in sorted(tracks, key=lambda e: e.get("ext") != "json3"):
+            url = entry["url"]
+            if entry.get("ext") != "json3":
+                # Any other format (srv3 is XML, not JSON) is re-requested as
+                # json3; replace an existing ``fmt`` instead of duplicating it.
+                parts = urlsplit(url)
+                pairs = parse_qsl(parts.query, keep_blank_values=True)
+                query = [(k, v) for k, v in pairs if k != "fmt"] + [("fmt", "json3")]
+                url = urlunsplit(parts._replace(query=urlencode(query)))
+            try:
+                import requests
+                response = requests.get(url, timeout=30)
+                response.raise_for_status()
+            except Exception as e:
+                logger.debug(f"[YT-DLP] ⚠️ Altyazı indirilemedi ({entry.get('ext')}): {e}")
+                continue
+            try:
+                payload = response.json()
+            except ValueError as e:
+                logger.debug(f"[YT-DLP] ⚠️ JSON parse edilemedi: {e}")
+                continue
+            transcript = self._parse_json3_transcript(payload)
+            if transcript:
+                return transcript
+        return None
+
+    @staticmethod
+    def _parse_json3_transcript(payload: Dict) -> Optional[List[Dict]]:
+        """Convert a YouTube json3 caption payload into a transcript list.
+
+        Args:
+            payload: Decoded json3 document; only its ``events`` list is read.
+
+        Returns:
+            A list of ``{"text", "start", "duration"}`` dicts with times in
+            seconds (converted from millisecond fields), or ``None`` when the
+            payload is malformed or contains no non-empty caption text.
+        """
+        if not isinstance(payload, dict):
+            return None
+        events = payload.get("events")
+        if not isinstance(events, list):
+            return None
+
+        def to_seconds(value) -> float:
+            # Remote data: a null or non-numeric timing becomes 0.0 instead of
+            # raising and discarding the whole transcript.
+            try:
+                return float(value) / 1000.0
+            except (TypeError, ValueError, OverflowError):
+                return 0.0
+
+        transcript: List[Dict] = []
+        for event in events:
+            segments = event.get("segs") if isinstance(event, dict) else None
+            if not isinstance(segments, list):
+                continue
+            # Join only string ``utf8`` values; a null or non-string value
+            # would make ``str.join`` raise TypeError.
+            parts = (s.get("utf8") for s in segments if isinstance(s, dict))
+            text = "".join(p for p in parts if isinstance(p, str)).strip()
+            if not text:
+                continue
+            duration_ms = event.get("dDurationMs")
+            if duration_ms is None:
+                duration_ms = event.get("tDurationMs")
+            start = to_seconds(event.get("tStartMs"))
+            transcript.append({"text": text, "start": start, "duration": to_seconds(duration_ms)})
+        return transcript or None