first commit
This commit is contained in:
286
src/web_server.py
Normal file
286
src/web_server.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Flask web server - RSS-Bridge-style URL template system
|
||||
"""
|
||||
from flask import Flask, request, Response, jsonify
|
||||
from typing import Optional
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from src.database import Database
|
||||
from src.video_fetcher import fetch_videos_from_rss_bridge, get_channel_id_from_handle, extract_video_id
|
||||
from src.transcript_extractor import TranscriptExtractor
|
||||
from src.transcript_cleaner import TranscriptCleaner
|
||||
from src.rss_generator import RSSGenerator
|
||||
|
||||
|
||||
# WSGI application object; the route decorators in this module register on it.
app = Flask(__name__)

# Shared service objects. They start out unset and are created on first use
# by get_db(), get_extractor() and get_cleaner().
db: Optional[Database] = None
extractor: Optional[TranscriptExtractor] = None
cleaner: Optional[TranscriptCleaner] = None
|
||||
|
||||
|
||||
def get_db():
    """Return the module-wide Database, building it on the first call.

    The first call constructs ``Database()``, stores it in the module-level
    ``db`` variable and then calls ``init_database()`` on it; later calls
    return the stored object unchanged.
    """
    global db
    if db is not None:
        return db
    db = Database()
    db.init_database()
    return db
|
||||
|
||||
|
||||
def get_extractor():
    """Return the module-wide TranscriptExtractor, creating it on first use."""
    global extractor
    if extractor is not None:
        return extractor
    extractor = TranscriptExtractor()
    return extractor
|
||||
|
||||
|
||||
def get_cleaner():
    """Return the module-wide TranscriptCleaner, creating it on first use."""
    global cleaner
    if cleaner is not None:
        return cleaner
    cleaner = TranscriptCleaner()
    return cleaner
|
||||
|
||||
|
||||
def _channel_id_from_url(url: str) -> Optional[str]:
    """Return the path segment that follows the last ``/channel/`` in *url*.

    Anything from the first ``?``, ``#`` or ``/`` after that segment is
    dropped. Returns None when the marker is absent or the segment is empty.
    """
    _, marker, tail = url.rpartition('/channel/')
    if not marker:
        return None
    for delimiter in ('?', '#', '/'):
        tail = tail.split(delimiter, 1)[0]
    return tail or None


def normalize_channel_id(channel_id: Optional[str] = None,
                         channel: Optional[str] = None,
                         channel_url: Optional[str] = None) -> Optional[str]:
    """
    Resolve a channel ID from whichever identifier the caller supplied.

    The inputs are tried in order: ``channel_id``, then ``channel``, then
    ``channel_url``. Surrounding whitespace is ignored on all of them.

    Args:
        channel_id: A direct channel ID (``UC`` prefix, 24 characters) or a
            ``youtube.com/channel/...`` URL.
        channel: A channel handle, with or without the leading ``@``.
        channel_url: A full channel URL, either a ``/@handle`` URL or a
            ``/channel/<id>`` URL.

    Returns:
        The channel ID, or None when nothing usable was supplied. Handle
        inputs are resolved through ``get_channel_id_from_handle`` and its
        result is returned as-is.
    """
    channel_id = (channel_id or '').strip()
    channel = (channel or '').strip()
    channel_url = (channel_url or '').strip()

    # Direct channel ID, or a /channel/ URL passed in the channel_id slot.
    if channel_id:
        if channel_id.startswith('UC') and len(channel_id) == 24:
            return channel_id
        if 'youtube.com/channel/' in channel_id:
            extracted = _channel_id_from_url(channel_id)
            # An empty segment is not an ID; fall through to the other inputs.
            if extracted:
                return extracted

    # Channel handle (@username).
    if channel:
        if not channel.startswith('@'):
            channel = f"@{channel}"
        handle_url = f"https://www.youtube.com/{channel}"
        return get_channel_id_from_handle(handle_url)

    # Full channel URL.
    if channel_url:
        if '/@' in channel_url:
            return get_channel_id_from_handle(channel_url)
        if '/channel/' in channel_url:
            return _channel_id_from_url(channel_url)

    return None
|
||||
|
||||
|
||||
def process_channel(channel_id: str, max_items: int = 50) -> dict:
    """
    Build the transcript feed data for one channel.

    Steps:
      1. Fetch the channel's videos through RSS-Bridge.
      2. Store every video that ``db.is_video_processed`` does not report as
         processed.
      3. Fetch and clean transcripts for up to 20 of this channel's pending
         videos and save them.
      4. Read back the processed videos for the channel.

    Args:
        channel_id: Channel ID whose videos are fetched and processed.
        max_items: Maximum number of videos requested from RSS-Bridge and
            maximum number of processed videos returned.

    Returns:
        A dict with the keys ``videos`` (processed videos from the database),
        ``channel_id`` and ``count`` (length of ``videos``).

    Raises:
        Exception: If the RSS-Bridge fetch fails; the original error is
            attached as ``__cause__``.
    """
    db = get_db()
    extractor = get_extractor()
    cleaner = get_cleaner()

    # Fetch the video list from RSS-Bridge.
    try:
        videos = fetch_videos_from_rss_bridge(
            base_url="https://rss-bridge.org/bridge01",
            channel_id=channel_id,
            format="Atom",
            max_items=max_items,
        )
    except Exception as e:
        raise Exception(f"RSS-Bridge hatası: {e}") from e

    # Add new videos to the database.
    for video in videos:
        video['channel_id'] = channel_id
        if not db.is_video_processed(video['video_id']):
            db.add_video(video)

    # Filter by channel BEFORE applying the batch limit. Slicing the global
    # pending queue first would let other channels' backlog fill the 20 slots
    # and leave this channel's videos unprocessed.
    pending_videos = [
        video for video in db.get_pending_videos()
        if video['channel_id'] == channel_id
    ][:20]

    for video in pending_videos:
        try:
            # Fetch the transcript.
            transcript = extractor.fetch_transcript(
                video['video_id'],
                languages=['tr', 'en'],
            )

            # A falsy transcript leaves the video in its pending state.
            if transcript:
                # Clean the transcript.
                raw, clean = cleaner.clean_transcript(transcript, sentences_per_paragraph=3)

                # Save to the database.
                # NOTE(review): language is stored as 'tr' even though 'en' is
                # also requested above -- confirm whether the extractor can
                # report which language it actually returned.
                db.update_video_transcript(
                    video['video_id'],
                    raw,
                    clean,
                    status=1,
                    language='tr',
                )
        except Exception as e:
            # Best effort: one failing video must not abort the whole batch.
            print(f"Transcript çıkarım hatası {video['video_id']}: {e}")
            db.mark_video_failed(video['video_id'], str(e))

    # Read back the processed videos.
    processed_videos = db.get_processed_videos(
        limit=max_items,
        channel_id=channel_id,
    )

    return {
        'videos': processed_videos,
        'channel_id': channel_id,
        'count': len(processed_videos),
    }
|
||||
|
||||
|
||||
@app.route('/', methods=['GET'])
def generate_feed():
    """
    Serve a transcript feed using an RSS-Bridge-style URL template.

    Query parameters:
        channel_id / channel / channel_url: Channel identifier in any of the
            forms accepted by ``normalize_channel_id``.
        format: ``Rss`` selects RSS (case-insensitive); any other value,
            including the default ``Atom``, selects Atom.
        max_items: Positive integer, default 50.

    Examples:
        - /?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom
        - /?channel=@tavakfi&format=Atom
        - /?channel_url=https://www.youtube.com/@tavakfi&format=Atom

    Responses:
        200 with the feed XML; 400 for an invalid ``max_items`` or an
        unresolvable channel; 404 when no processed videos exist yet; 500 with
        the error text when processing raises.
    """
    # Read the query parameters.
    channel_id = request.args.get('channel_id')
    channel = request.args.get('channel')  # @username or username
    channel_url = request.args.get('channel_url')
    format_type = request.args.get('format', 'Atom').lower()  # Atom or Rss

    # max_items is client-supplied text: reject anything that is not a
    # positive integer instead of letting int() crash the request.
    try:
        max_items = int(request.args.get('max_items', 50))
    except ValueError:
        max_items = 0
    if max_items < 1:
        return jsonify({
            'error': 'max_items pozitif bir tam sayı olmalıdır'
        }), 400

    # Resolve the channel ID.
    normalized_channel_id = normalize_channel_id(
        channel_id=channel_id,
        channel=channel,
        channel_url=channel_url
    )

    if not normalized_channel_id:
        return jsonify({
            'error': 'Channel ID bulunamadı',
            'usage': {
                'channel_id': 'UC... (YouTube Channel ID)',
                'channel': '@username veya username',
                'channel_url': 'https://www.youtube.com/@username veya https://www.youtube.com/channel/UC...',
                'format': 'Atom veya Rss (varsayılan: Atom)',
                'max_items': 'Maksimum video sayısı (varsayılan: 50)'
            }
        }), 400

    try:
        # Process the channel.
        result = process_channel(normalized_channel_id, max_items=max_items)

        if not result['videos']:
            return jsonify({
                'error': 'Henüz işlenmiş video yok',
                'channel_id': normalized_channel_id,
                'message': 'Lütfen birkaç dakika sonra tekrar deneyin'
            }), 404

        # Build the feed.
        channel_info = {
            'id': normalized_channel_id,
            'title': f"YouTube Transcript Feed - {normalized_channel_id}",
            'link': f"https://www.youtube.com/channel/{normalized_channel_id}",
            'description': f'Full-text transcript RSS feed for channel {normalized_channel_id}',
            'language': 'en'
        }

        generator = RSSGenerator(channel_info)

        for video in result['videos']:
            generator.add_video_entry(video)

        # Serialise in the requested format; both branches share one Response.
        if format_type == 'rss':
            feed_body = generator.generate_rss_string()
            feed_mimetype = 'application/rss+xml'
        else:  # Atom
            feed_body = generator.generate_atom_string()
            feed_mimetype = 'application/atom+xml'

        return Response(
            feed_body,
            mimetype=feed_mimetype,
            headers={'Content-Type': f'{feed_mimetype}; charset=utf-8'}
        )

    except Exception as e:
        return jsonify({
            'error': str(e),
            'channel_id': normalized_channel_id
        }), 500
|
||||
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint: always answers with a fixed JSON status body."""
    status_payload = {
        'status': 'ok',
        'service': 'YouTube Transcript RSS Feed',
    }
    return jsonify(status_payload)
|
||||
|
||||
|
||||
@app.route('/info', methods=['GET'])
def info():
    """Describe the API: service name, version, endpoints, parameters, examples."""
    endpoints = {
        '/': 'RSS Feed Generator',
        '/health': 'Health Check',
        '/info': 'API Info',
    }
    usage = {
        'channel_id': 'UC... (YouTube Channel ID)',
        'channel': '@username veya username',
        'channel_url': 'Full YouTube channel URL',
        'format': 'Atom veya Rss (varsayılan: Atom)',
        'max_items': 'Maksimum video sayısı (varsayılan: 50)',
    }
    examples = [
        '/?channel_id=UC9h8BDcXwkhZtnqoQJ7PggA&format=Atom',
        '/?channel=@tavakfi&format=Rss',
        '/?channel_url=https://www.youtube.com/@tavakfi&format=Atom&max_items=100',
    ]

    description = {
        'service': 'YouTube Transcript RSS Feed Generator',
        'version': '1.0.0',
        'endpoints': endpoints,
        'usage': usage,
        'examples': examples,
    }
    return jsonify(description)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import os

    # The server binds to every interface, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run code.
    # Set FLASK_DEBUG=1 (or true/yes/on) to opt in for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on',
    )
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
|
||||
|
||||
Reference in New Issue
Block a user