sync: auto-commit uncommitted changes
This commit is contained in:
@@ -7,4 +7,5 @@ spacy>=3.7.0
|
||||
pyyaml>=6.0.1
|
||||
flask>=3.0.0
|
||||
pytz>=2023.3
|
||||
yt-dlp>=2024.8.1
|
||||
|
||||
|
||||
@@ -15,6 +15,22 @@ except ImportError:
|
||||
except ImportError: # pragma: no cover - fallback for unexpected API changes
|
||||
class NoTranscriptAvailable(Exception): # type: ignore
|
||||
"""Fallback exception when youtube_transcript_api does not expose NoTranscriptAvailable."""
|
||||
|
||||
try:
|
||||
from youtube_transcript_api import YouTubeDataUnparsable # type: ignore
|
||||
except ImportError:
|
||||
try:
|
||||
from youtube_transcript_api._errors import YouTubeDataUnparsable # type: ignore
|
||||
except ImportError: # pragma: no cover
|
||||
class YouTubeDataUnparsable(Exception): # type: ignore
|
||||
"""Fallback exception when transcript payload cannot be parsed."""
|
||||
|
||||
try:
|
||||
import yt_dlp # type: ignore
|
||||
except Exception: # pragma: no cover - optional dependency
|
||||
yt_dlp = None
|
||||
|
||||
import json
|
||||
from typing import List, Dict, Optional
|
||||
import time
|
||||
import logging
|
||||
@@ -247,6 +263,20 @@ class TranscriptExtractor:
|
||||
)
|
||||
|
||||
return transcript
|
||||
except YouTubeDataUnparsable as e:
|
||||
logger.error(
|
||||
f"[TRANSCRIPT] ❌ Video {video_id} transcript çıkarımı başarısız (YouTubeDataUnparsable): {e}"
|
||||
)
|
||||
fallback_transcript = self._fetch_transcript_with_yt_dlp(video_id, languages)
|
||||
if fallback_transcript:
|
||||
logger.info(
|
||||
f"[TRANSCRIPT] 🔄 yt-dlp fallback ile transcript çıkarıldı ({len(fallback_transcript)} segment)"
|
||||
)
|
||||
return fallback_transcript
|
||||
logger.error(
|
||||
f"[TRANSCRIPT] ❌ yt-dlp fallback da başarısız oldu: Video {video_id}"
|
||||
)
|
||||
return None
|
||||
except (TranscriptsDisabled, NoTranscriptAvailable) as e:
|
||||
logger.error(
|
||||
f"[TRANSCRIPT] ❌ Video {video_id} için transcript devre dışı bırakılmış veya mevcut değil: {type(e).__name__} - {e}"
|
||||
@@ -276,3 +306,151 @@ class TranscriptExtractor:
|
||||
|
||||
return None
|
||||
|
||||
def _fetch_transcript_with_yt_dlp(self, video_id: str, languages: List[str]) -> Optional[List[Dict]]:
    """Fetch a transcript through yt-dlp when youtube-transcript-api fails.

    Caption tracks are tried in this order:

    1. manual subtitles in each preferred language,
    2. automatic captions in each preferred language,
    3. every remaining track (manual first, then automatic) in any language.

    Args:
        video_id: Identifier passed unchanged to ``YoutubeDL.extract_info``.
        languages: Preferred language codes, most preferred first. May be
            empty or ``None``. For a code containing ``-`` the part before
            the first ``-`` is added as a candidate too, and ``en``,
            ``en-US`` and ``en-GB`` are always appended as last candidates.

    Returns:
        The first transcript returned by ``_download_caption_entries``, or
        ``None`` when yt-dlp is not importable, the metadata lookup fails or
        returns nothing, or no caption track yields a transcript.
    """
    if yt_dlp is None:
        logger.error("[YT-DLP] ❌ yt-dlp kütüphanesi yüklü değil, fallback kullanılamıyor")
        return None

    # Build an ordered, de-duplicated list of language codes to try.
    lang_candidates: List[str] = []
    for lang in languages or []:
        if not lang:
            continue
        if lang not in lang_candidates:
            lang_candidates.append(lang)
        if "-" in lang:
            base = lang.split("-", 1)[0]
            if base and base not in lang_candidates:
                lang_candidates.append(base)

    for fallback_lang in ["en", "en-US", "en-GB"]:
        if fallback_lang not in lang_candidates:
            lang_candidates.append(fallback_lang)

    # Metadata only: nothing is downloaded or written to disk by yt-dlp.
    ydl_opts = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
        "writesubtitles": False,
        "writeautomaticsub": False,
        "subtitleslangs": lang_candidates,
        "subtitlesformat": "json3",
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[operator]
            info = ydl.extract_info(video_id, download=False)
    except Exception as e:
        logger.error(f"[YT-DLP] ❌ Video {video_id} bilgileri alınamadı: {e}")
        return None

    # Guard against an empty result so the lookups below cannot raise
    # AttributeError on None.
    if not info:
        logger.error(f"[YT-DLP] ❌ Video {video_id} bilgileri alınamadı: yt-dlp boş sonuç döndürdü")
        return None

    subtitles = info.get("subtitles") or {}
    auto_subtitles = info.get("automatic_captions") or {}

    for lang in lang_candidates:
        transcript = self._download_caption_entries(subtitles.get(lang))
        if transcript:
            logger.info(f"[YT-DLP] ✅ Manuel altyazı bulundu (lang={lang})")
            return transcript

    for lang in lang_candidates:
        transcript = self._download_caption_entries(auto_subtitles.get(lang))
        if transcript:
            logger.info(f"[YT-DLP] ✅ Otomatik altyazı bulundu (lang={lang})")
            return transcript

    # Last resort: any other language. Tracks for the preferred languages
    # were already attempted in both sources above, so skip them instead of
    # downloading them a second time.
    already_tried = set(lang_candidates)
    for caption_source in (subtitles, auto_subtitles):
        for lang, entries in caption_source.items():
            if lang in already_tried:
                continue
            transcript = self._download_caption_entries(entries)
            if transcript:
                logger.info("[YT-DLP] ✅ Farklı dilde transcript bulundu")
                return transcript

    logger.warning(f"[YT-DLP] ⚠️ Video {video_id} için uygun transcript bulunamadı")
    return None
|
||||
|
||||
def _download_caption_entries(self, entries) -> Optional[List[Dict]]:
|
||||
"""yt-dlp'nin döndürdüğü altyazı girişlerini indirip transcript'e çevirir."""
|
||||
if not entries:
|
||||
return None
|
||||
|
||||
if not isinstance(entries, list):
|
||||
entries = [entries]
|
||||
|
||||
for entry in entries:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
url = entry.get("url")
|
||||
if not url:
|
||||
continue
|
||||
|
||||
ext = entry.get("ext")
|
||||
if ext not in {"json3", "srv3"}:
|
||||
join_char = "&" if "?" in url else "?"
|
||||
url = f"{url}{join_char}fmt=json3"
|
||||
|
||||
try:
|
||||
import requests
|
||||
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
except Exception as e:
|
||||
logger.debug(f"[YT-DLP] ⚠️ Altyazı indirilemedi ({entry.get('ext')}): {e}")
|
||||
continue
|
||||
|
||||
try:
|
||||
payload = response.json()
|
||||
except ValueError:
|
||||
try:
|
||||
payload = json.loads(response.text)
|
||||
except ValueError as e:
|
||||
logger.debug(f"[YT-DLP] ⚠️ JSON parse edilemedi: {e}")
|
||||
continue
|
||||
|
||||
transcript = self._parse_json3_transcript(payload)
|
||||
if transcript:
|
||||
return transcript
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _parse_json3_transcript(payload: Dict) -> Optional[List[Dict]]:
|
||||
"""YouTube json3 altyazı formatını standart transcript listesine çevir."""
|
||||
if not isinstance(payload, dict):
|
||||
return None
|
||||
|
||||
events = payload.get("events")
|
||||
if not isinstance(events, list):
|
||||
return None
|
||||
|
||||
transcript: List[Dict] = []
|
||||
for event in events:
|
||||
if not isinstance(event, dict):
|
||||
continue
|
||||
segments = event.get("segs") or []
|
||||
if not isinstance(segments, list):
|
||||
continue
|
||||
|
||||
text_parts = []
|
||||
for seg in segments:
|
||||
if isinstance(seg, dict):
|
||||
text_parts.append(seg.get("utf8", ""))
|
||||
text = "".join(text_parts).strip()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
start_ms = event.get("tStartMs") or 0
|
||||
duration_ms = event.get("dDurationMs")
|
||||
if duration_ms is None:
|
||||
duration_ms = event.get("tDurationMs", 0)
|
||||
|
||||
start = float(start_ms) / 1000.0
|
||||
duration = float(duration_ms) / 1000.0 if duration_ms else 0.0
|
||||
|
||||
transcript.append(
|
||||
{
|
||||
"text": text,
|
||||
"start": start,
|
||||
"duration": duration,
|
||||
}
|
||||
)
|
||||
|
||||
return transcript or None
|
||||
|
||||
Reference in New Issue
Block a user