sync: auto-commit uncommitted changes

This commit is contained in:
salvacybersec
2026-02-25 16:22:33 +03:00
parent e52708cda9
commit c22b33f37d
2 changed files with 179 additions and 0 deletions

View File

@@ -7,4 +7,5 @@ spacy>=3.7.0
pyyaml>=6.0.1
flask>=3.0.0
pytz>=2023.3
yt-dlp>=2024.8.1

View File

@@ -15,6 +15,22 @@ except ImportError:
except ImportError: # pragma: no cover - fallback for unexpected API changes
class NoTranscriptAvailable(Exception): # type: ignore
"""Fallback exception when youtube_transcript_api does not expose NoTranscriptAvailable."""
# Resolve YouTubeDataUnparsable: try the package's top-level namespace first,
# then its private ``_errors`` module. If neither import works, define a local
# stand-in class so ``except YouTubeDataUnparsable`` clauses in this module
# still reference a valid exception type.
try:
from youtube_transcript_api import YouTubeDataUnparsable # type: ignore
except ImportError:
try:
from youtube_transcript_api._errors import YouTubeDataUnparsable # type: ignore
except ImportError: # pragma: no cover
class YouTubeDataUnparsable(Exception): # type: ignore
"""Fallback exception when transcript payload cannot be parsed."""
# yt-dlp is an optional dependency: any failure while importing it (not only
# ImportError) leaves ``yt_dlp`` set to None, which
# ``_fetch_transcript_with_yt_dlp`` checks before attempting the fallback.
try:
import yt_dlp # type: ignore
except Exception: # pragma: no cover - optional dependency
yt_dlp = None
import json
from typing import List, Dict, Optional
import time
import logging
@@ -247,6 +263,20 @@ class TranscriptExtractor:
)
return transcript
except YouTubeDataUnparsable as e:
logger.error(
f"[TRANSCRIPT] ❌ Video {video_id} transcript çıkarımı başarısız (YouTubeDataUnparsable): {e}"
)
fallback_transcript = self._fetch_transcript_with_yt_dlp(video_id, languages)
if fallback_transcript:
logger.info(
f"[TRANSCRIPT] 🔄 yt-dlp fallback ile transcript çıkarıldı ({len(fallback_transcript)} segment)"
)
return fallback_transcript
logger.error(
f"[TRANSCRIPT] ❌ yt-dlp fallback da başarısız oldu: Video {video_id}"
)
return None
except (TranscriptsDisabled, NoTranscriptAvailable) as e:
logger.error(
f"[TRANSCRIPT] ❌ Video {video_id} için transcript devre dışı bırakılmış veya mevcut değil: {type(e).__name__} - {e}"
@@ -276,3 +306,151 @@ class TranscriptExtractor:
return None
def _fetch_transcript_with_yt_dlp(self, video_id: str, languages: List[str]) -> Optional[List[Dict]]:
    """Extract a transcript through yt-dlp when youtube-transcript-api fails.

    Lookup order:
      1. manually authored subtitles, in language-preference order;
      2. automatic captions, in the same order;
      3. any remaining track in any language (manual first, then automatic).

    Args:
        video_id: Identifier handed to ``YoutubeDL.extract_info``.
        languages: Preferred language codes, most preferred first. May be
            empty or ``None``; English variants are always appended as a
            last resort.

    Returns:
        The transcript produced by ``_download_caption_entries`` for the
        first track that yields one, or ``None`` when yt-dlp is missing,
        metadata extraction fails, or no track is usable.
    """
    if yt_dlp is None:
        logger.error("[YT-DLP] ❌ yt-dlp kütüphanesi yüklü değil, fallback kullanılamıyor")
        return None

    # Ordered, de-duplicated preference list.
    lang_candidates: List[str] = []

    def _remember(code) -> None:
        """Append *code* to the preference list unless empty or already present."""
        if code and code not in lang_candidates:
            lang_candidates.append(code)

    for requested in languages or []:
        _remember(requested)
        # A regional code such as "tr-TR" also implies its base language "tr".
        if requested and "-" in requested:
            _remember(requested.split("-", 1)[0])
    for english_variant in ("en", "en-US", "en-GB"):
        _remember(english_variant)

    ydl_opts = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
        "writesubtitles": False,
        "writeautomaticsub": False,
        "subtitleslangs": lang_candidates,
        "subtitlesformat": "json3",
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[operator]
            info = ydl.extract_info(video_id, download=False)
    except Exception as e:
        logger.error(f"[YT-DLP] ❌ Video {video_id} bilgileri alınamadı: {e}")
        return None

    manual_tracks = info.get("subtitles") or {}
    automatic_tracks = info.get("automatic_captions") or {}

    def _first_preferred(tracks):
        """Return ``(lang, transcript)`` for the first preferred language that works."""
        for code in lang_candidates:
            found = self._download_caption_entries(tracks.get(code))
            if found:
                return code, found
        return None, None

    lang, transcript = _first_preferred(manual_tracks)
    if transcript:
        logger.info(f"[YT-DLP] ✅ Manuel altyazı bulundu (lang={lang})")
        return transcript

    lang, transcript = _first_preferred(automatic_tracks)
    if transcript:
        logger.info(f"[YT-DLP] ✅ Otomatik altyazı bulundu (lang={lang})")
        return transcript

    # Nothing matched the preference list: accept whichever track works first.
    for tracks in (manual_tracks, automatic_tracks):
        for entries in tracks.values():
            transcript = self._download_caption_entries(entries)
            if transcript:
                logger.info("[YT-DLP] ✅ Farklı dilde transcript bulundu")
                return transcript

    logger.warning(f"[YT-DLP] ⚠️ Video {video_id} için uygun transcript bulunamadı")
    return None
def _download_caption_entries(self, entries) -> Optional[List[Dict]]:
    """Download the caption entries returned by yt-dlp and convert them to a transcript.

    Each entry is expected to be a dict with a ``url`` and usually an ``ext``
    key. Every usable URL is requested in ``json3`` form and the first
    response that parses into a non-empty transcript wins.

    Args:
        entries: A single entry dict, a list of entries, or a falsy value.
            Non-dict items and items without a string ``url`` are ignored.

    Returns:
        The transcript produced by ``_parse_json3_transcript`` for the first
        entry that downloads and parses successfully, otherwise ``None``.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    if not entries:
        return None
    if not isinstance(entries, list):
        entries = [entries]

    # Resolve all download URLs up front so that nothing network-related is
    # touched when no entry is usable.
    targets = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        url = entry.get("url")
        if not url or not isinstance(url, str):
            continue
        ext = entry.get("ext")
        if ext != "json3":
            # Only json3 is JSON; srv1/srv2/srv3, ttml and vtt are not, so
            # every other format must be switched. Replace any existing
            # ``fmt`` parameter instead of appending a second, conflicting one.
            parts = urlsplit(url)
            query = [
                (key, value)
                for key, value in parse_qsl(parts.query, keep_blank_values=True)
                if key != "fmt"
            ]
            query.append(("fmt", "json3"))
            url = urlunsplit(parts._replace(query=urlencode(query)))
        targets.append((url, ext))

    if not targets:
        return None

    # Import once, outside the download loop, so a missing package is
    # reported as such rather than as a per-entry download failure.
    try:
        import requests
    except ImportError:
        logger.error("[YT-DLP] ❌ requests kütüphanesi yüklü değil, altyazı indirilemiyor")
        return None

    for url, ext in targets:
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.debug(f"[YT-DLP] ⚠️ Altyazı indirilemedi ({ext}): {e}")
            continue

        try:
            # Response.json() raises a ValueError subclass on invalid JSON.
            payload = response.json()
        except ValueError as e:
            logger.debug(f"[YT-DLP] ⚠️ JSON parse edilemedi: {e}")
            continue

        transcript = self._parse_json3_transcript(payload)
        if transcript:
            return transcript

    return None
@staticmethod
def _parse_json3_transcript(payload: Dict) -> Optional[List[Dict]]:
    """Convert a YouTube ``json3`` caption payload into a standard transcript list.

    Args:
        payload: Decoded JSON document. It must be a dict whose ``events``
            key holds a list; each event may carry ``segs`` (a list of
            dicts with a ``utf8`` text fragment), ``tStartMs`` and
            ``dDurationMs`` (or ``tDurationMs``) in milliseconds.

    Returns:
        A list of ``{"text", "start", "duration"}`` dicts with times in
        seconds, or ``None`` when the payload has the wrong shape or no
        event contains text. Malformed events, segments and timing values
        are skipped or zeroed instead of raising.
    """
    if not isinstance(payload, dict):
        return None
    events = payload.get("events")
    if not isinstance(events, list):
        return None

    def _seconds(raw_ms) -> float:
        """Convert milliseconds to seconds; missing or malformed values give 0.0."""
        if not raw_ms:
            return 0.0
        try:
            return float(raw_ms) / 1000.0
        except (TypeError, ValueError):
            return 0.0

    transcript: List[Dict] = []
    for event in events:
        if not isinstance(event, dict):
            continue
        segments = event.get("segs") or []
        if not isinstance(segments, list):
            continue

        # Keep only real string fragments: a segment may lack "utf8" or carry
        # null, which would make str.join raise TypeError.
        text = "".join(
            seg["utf8"]
            for seg in segments
            if isinstance(seg, dict) and isinstance(seg.get("utf8"), str)
        ).strip()
        if not text:
            continue

        duration_ms = event.get("dDurationMs")
        if duration_ms is None:
            duration_ms = event.get("tDurationMs", 0)

        transcript.append(
            {
                "text": text,
                "start": _seconds(event.get("tStartMs")),
                "duration": _seconds(duration_ms),
            }
        )

    return transcript or None