Files
Youtube2Feed/main.py
salvacybersec abe170a1f8 first commit
2025-11-13 03:25:21 +03:00

164 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
YouTube Transcript RSS Feed Generator - Ana Pipeline
"""
import yaml
import os
import sys
from pathlib import Path
# Proje root'unu path'e ekle
sys.path.insert(0, str(Path(__file__).parent))
from src.database import Database
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle
from src.transcript_extractor import TranscriptExtractor
from src.transcript_cleaner import TranscriptCleaner
from src.rss_generator import RSSGenerator
def load_config(config_path: str = "config/config.yaml") -> dict:
    """Load the YAML configuration file.

    Args:
        config_path: Path of the YAML file; it is read as UTF-8.

    Returns:
        The parsed configuration. When the file holds no YAML document
        (e.g. it is empty) an empty dict is returned, so callers can always
        use ``.get`` on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    # safe_load() yields None for an empty stream; normalise it so the
    # declared ``dict`` return type actually holds.
    return {} if config is None else config
def get_channel_id(config: dict) -> str:
    """Resolve the YouTube channel ID from the configuration.

    The ``channel`` section of *config* is consulted in this order:

    1. ``id`` - returned as-is.
    2. ``handle_url`` - resolved through ``get_channel_id_from_handle``.
    3. ``handle`` - appended to ``https://www.youtube.com/`` and resolved
       the same way.

    A lookup that yields a falsy result falls through to the next option.

    Args:
        config: Parsed configuration mapping.

    Returns:
        The channel ID.

    Raises:
        ValueError: If none of the options produces a channel ID.
    """
    # A bare ``channel:`` key parses to None; treat it like a missing section
    # so the caller gets the ValueError below instead of an AttributeError.
    channel_config = config.get('channel') or {}

    # Explicit channel ID wins.
    explicit_id = channel_config.get('id')
    if explicit_id:
        return explicit_id

    # Full handle URL.
    handle_url = channel_config.get('handle_url')
    if handle_url:
        channel_id = get_channel_id_from_handle(handle_url)
        if channel_id:
            return channel_id

    # Bare handle: build the URL ourselves.
    handle = channel_config.get('handle')
    if handle:
        channel_id = get_channel_id_from_handle(f"https://www.youtube.com/{handle}")
        if channel_id:
            return channel_id

    raise ValueError("Channel ID bulunamadı! Config'de id, handle veya handle_url belirtin.")
# Upper bound on videos processed per run (the original hard-coded slice,
# marked there as a testing limit).
_MAX_VIDEOS_PER_RUN = 10


def _config_section(config: dict, name: str) -> dict:
    """Return config section *name*, or an empty dict if missing or null."""
    return config.get(name) or {}


def _add_new_videos(db, videos, channel_id: str) -> int:
    """Store videos the database has not seen yet; return how many were added."""
    new_count = 0
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)
            new_count += 1
    return new_count


def _process_pending_videos(db, pending_videos, transcript_config: dict) -> None:
    """Fetch, clean and store transcripts for the first pending videos."""
    extractor = TranscriptExtractor()
    cleaner = TranscriptCleaner()
    # Loop-invariant settings. An empty or null language list falls back to
    # the default so that ``languages[0]`` below cannot fail.
    languages = transcript_config.get('languages') or ['en']
    paragraph_length = transcript_config.get('paragraph_length', 3)

    for video in pending_videos[:_MAX_VIDEOS_PER_RUN]:
        print(f"İşleniyor: {video['video_title']}")
        transcript = extractor.fetch_transcript(
            video['video_id'],
            languages=languages,
        )
        if not transcript:
            # Mark as failed.
            db.mark_video_failed(video['video_id'], "Transcript bulunamadı")
            print(f"✗ Başarısız: {video['video_title']}")
            continue

        raw, clean = cleaner.clean_transcript(
            transcript,
            sentences_per_paragraph=paragraph_length,
        )
        db.update_video_transcript(
            video['video_id'],
            raw,
            clean,
            status=1,  # success
            # NOTE(review): records the first *preferred* language, which may
            # differ from the language actually fetched - confirm.
            language=languages[0],
        )
        print(f"✓ Tamamlandı: {video['video_title']}")


def _write_feed(db, config: dict, channel_id: str) -> None:
    """Generate the RSS file from processed videos; no-op when there are none."""
    processed_videos = db.get_processed_videos(
        limit=_config_section(config, 'automation').get('max_items', 100),
        channel_id=channel_id,
    )
    if not processed_videos:
        return

    rss_config = _config_section(config, 'rss')
    channel_config = _config_section(config, 'channel')
    channel_info = {
        'id': channel_id,
        'title': rss_config.get('title', 'Transcript Feed'),
        'link': channel_config.get('url', ''),
        'description': rss_config.get('description', ''),
        'language': channel_config.get('language', 'en'),
    }
    generator = RSSGenerator(channel_info)
    for video in processed_videos:
        generator.add_video_entry(video)

    output_file = rss_config.get('output_file', 'transcript_feed.xml')
    output_path = f"output/{output_file}"
    # Create the directory of the final path, so an ``output_file`` that
    # contains a subdirectory also works.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    generator.generate_rss(output_path)
    print(f"RSS feed oluşturuldu: {output_path}")


def main():
    """Run the main pipeline.

    Steps: load the config, resolve the channel ID, fetch the video list
    from RSS-Bridge, store unseen videos, extract and clean transcripts for
    pending videos, and write the RSS feed. If fetching the video list
    fails, the error is printed and the run stops early. The database is
    closed on every exit path once it has been initialised.
    """
    print("YouTube Transcript RSS Feed Generator başlatılıyor...")

    config = load_config()

    channel_id = get_channel_id(config)
    print(f"Channel ID: {channel_id}")

    db = Database()
    db.init_database()
    try:
        # Fetch the video list from RSS-Bridge.
        rss_bridge_config = _config_section(config, 'rss_bridge')
        print("RSS-Bridge'den videolar çekiliyor...")
        try:
            videos = fetch_videos_from_rss_bridge(
                base_url=rss_bridge_config.get('base_url', 'https://rss-bridge.org/bridge01'),
                channel_id=channel_id,
                format=rss_bridge_config.get('format', 'Atom'),
                max_items=rss_bridge_config.get('max_items', 100),
            )
            print(f"{len(videos)} video bulundu")
        except Exception as e:
            # Top-level boundary: report and stop; ``finally`` closes the db.
            print(f"Hata: {e}")
            return

        # Store videos not yet in the database.
        new_count = _add_new_videos(db, videos, channel_id)
        print(f"{new_count} yeni video eklendi")

        # Process videos still waiting for a transcript.
        pending_videos = db.get_pending_videos()
        print(f"{len(pending_videos)} video işlenmeyi bekliyor")
        if pending_videos:
            _process_pending_videos(
                db, pending_videos, _config_section(config, 'transcript')
            )

        # Build the RSS feed.
        _write_feed(db, config, channel_id)
    finally:
        db.close()

    print("Tamamlandı!")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()