Add ekos-gazete-search FullSweep + telegram + browser-use skills
Skills:
- ekos-gazete-search: EKOS gazete arşivi (1928-1942) tarama skill'i.
+ 04_export.py (CSV+DOCX), run_capped.sh (systemd cap wrapper),
02_search_pdfs.py interleaved-dispatch patch (crash-safe), kirim_core.yaml.
- telegram: TG inbox/search/send/read scripts.
- browser-use: paperclip browser automation skill.
build.py:
- Add ekos-gazete-search → scribe, scholar, oracle, frodo, chronos,
centurion, wraith mapping.
- Add telegram, browser-use mappings (browser-use uses "*" wildcard).
- Add wildcard "*" support in DEFAULT_SKILL_PERSONA_MAP.
- Add paperclip_skills + community_skills buckets to skill injection.
- Wrap yaml.safe_load in try/except for malformed frontmatter.
- Index paperclip_skills with inferred persona mapping.
README.md:
- Add telegram skill to Sentinel/Frodo/Oracle/Echo skill lists.
This commit is contained in:
@@ -84,17 +84,17 @@ cat generated/sentinel/apt-profiling.yaml # YAML with metadata
|
||||
| **Specter** | Malware Analyst / Reverse Engineer | Cerrah | general, firmware | — |
|
||||
| **Bastion** | Blue Team / DFIR | Muhafız | general, forensics, threat-hunting, incident-commander | senior-secops, sys-guard-linux-remediator, pcap-analyzer |
|
||||
| **Vortex** | Network Ops / Traffic Analysis | Telsizci | general, cloud-ad | nmap-recon, pcap-analyzer, dns-networking |
|
||||
| **Sentinel** | CTI / Threat Intelligence | İzci | general, apt-profiling, mitre-attack, darknet, **c2-hunting** | seithar-intel, gov-cybersecurity, pentest-c2-operator |
|
||||
| **Sentinel** | CTI / Threat Intelligence | İzci | general, apt-profiling, mitre-attack, darknet, **c2-hunting** | seithar-intel, gov-cybersecurity, pentest-c2-operator, telegram |
|
||||
|
||||
### Intelligence (5 personas, 29 variants)
|
||||
|
||||
| Codename | Role | Hitap | Variants | Skills |
|
||||
|----------|------|-------|----------|--------|
|
||||
| **Frodo** | Strategic Intelligence Analyst | Müsteşar | general, middle-east, russia, iran, africa, china, pakistan, india, nato-alliance, nuclear, energy-geopolitics, turkey, salva | freshrss, freshrss-reader, seithar-intel, war-intel-monitor, news-crawler, dellight-intelligence-ops, dellight-strategic-intelligence |
|
||||
| **Oracle** | OSINT & Digital Intelligence | Kaşif | general, crypto-osint, **source-verification**, salva | osint-investigator, stealth-browser, deep-scraper, crawl-for-ai, image-ocr, mistral-ocr, freshrss +2 |
|
||||
| **Frodo** | Strategic Intelligence Analyst | Müsteşar | general, middle-east, russia, iran, africa, china, pakistan, india, nato-alliance, nuclear, energy-geopolitics, turkey, salva | freshrss, freshrss-reader, seithar-intel, war-intel-monitor, news-crawler, dellight-intelligence-ops, dellight-strategic-intelligence, telegram |
|
||||
| **Oracle** | OSINT & Digital Intelligence | Kaşif | general, crypto-osint, **source-verification**, salva | osint-investigator, stealth-browser, deep-scraper, crawl-for-ai, image-ocr, mistral-ocr, freshrss, telegram +2 |
|
||||
| **Ghost** | PSYOP & Information Warfare | Propagandist | general, cognitive-warfare, russian-info-war, salva | social-trust-manipulation-detector |
|
||||
| **Wraith** | HUMINT & Counter-Intelligence | Mahrem | general, source-validation, case-studies, salva | — |
|
||||
| **Echo** | SIGINT / COMINT / ELINT | Kulakçı | general, nsa-sigint, electronic-order-of-battle, salva | dellight-intelligence-ops |
|
||||
| **Echo** | SIGINT / COMINT / ELINT | Kulakçı | general, nsa-sigint, electronic-order-of-battle, salva | dellight-intelligence-ops, telegram |
|
||||
|
||||
### Military & Strategy (4 personas, 24 variants)
|
||||
|
||||
|
||||
92
build.py
92
build.py
@@ -258,7 +258,12 @@ def build_persona(
|
||||
# Inject mapped skills for this persona
|
||||
if skills_index:
|
||||
mapped_skills = []
|
||||
for bucket in ("skills", "feynman_skills"):
|
||||
for bucket in (
|
||||
"skills",
|
||||
"paperclip_skills",
|
||||
"community_skills",
|
||||
"feynman_skills",
|
||||
):
|
||||
for skill_name, skill_info in skills_index.get(bucket, {}).items():
|
||||
if not isinstance(skill_info, dict):
|
||||
continue
|
||||
@@ -306,6 +311,8 @@ def build_persona(
|
||||
|
||||
|
||||
DEFAULT_SKILL_PERSONA_MAP = {
|
||||
# Browser automation for every persona
|
||||
"browser-use": ["*"],
|
||||
# Cybersecurity skills → personas
|
||||
"pentest": ["neo"],
|
||||
"nmap-recon": ["neo", "vortex"],
|
||||
@@ -336,6 +343,7 @@ DEFAULT_SKILL_PERSONA_MAP = {
|
||||
"news-crawler": ["frodo", "herald"],
|
||||
"dellight-intelligence-ops": ["frodo", "echo"],
|
||||
"dellight-strategic-intelligence": ["frodo"],
|
||||
"telegram": ["frodo", "oracle", "sentinel", "echo"],
|
||||
"agent-intelligence-network-scan": ["oracle"],
|
||||
"social-trust-manipulation-detector": ["ghost"],
|
||||
# Infrastructure skills → personas
|
||||
@@ -349,6 +357,8 @@ DEFAULT_SKILL_PERSONA_MAP = {
|
||||
# Web scraping → personas
|
||||
"deep-scraper": ["oracle"],
|
||||
"crawl-for-ai": ["oracle", "herald"],
|
||||
# Historical / archival research → personas
|
||||
"ekos-gazete-search": ["scribe", "scholar", "oracle", "frodo", "chronos", "centurion", "wraith"],
|
||||
}
|
||||
|
||||
|
||||
@@ -391,7 +401,10 @@ def parse_skill_frontmatter(skill_md: Path) -> dict:
|
||||
fm_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
|
||||
if not fm_match:
|
||||
return {}
|
||||
parsed = yaml.safe_load(fm_match.group(1))
|
||||
try:
|
||||
parsed = yaml.safe_load(fm_match.group(1))
|
||||
except yaml.YAMLError:
|
||||
return {}
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
|
||||
|
||||
@@ -514,13 +527,18 @@ def infer_personas_from_skill_metadata(skill_name: str, metadata: dict) -> list:
|
||||
def load_skill_persona_map(config: dict) -> dict:
|
||||
"""Load skill→persona mapping from config.yaml or use defaults."""
|
||||
custom = config.get("skill_persona_map", {})
|
||||
merged = {
|
||||
k: [p for p in v if p in VALID_PERSONAS]
|
||||
for k, v in DEFAULT_SKILL_PERSONA_MAP.items()
|
||||
}
|
||||
merged = {}
|
||||
for skill, personas in DEFAULT_SKILL_PERSONA_MAP.items():
|
||||
if "*" in personas:
|
||||
merged[skill] = sorted(VALID_PERSONAS)
|
||||
else:
|
||||
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
|
||||
for skill, personas in custom.items():
|
||||
if isinstance(personas, list):
|
||||
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
|
||||
if "*" in personas:
|
||||
merged[skill] = sorted(VALID_PERSONAS)
|
||||
else:
|
||||
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
|
||||
return merged
|
||||
|
||||
|
||||
@@ -718,7 +736,35 @@ def build_skills_index(shared_dir: Path, config: dict = None) -> dict:
|
||||
continue
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
index["paperclip_skills"][skill_dir.name] = True
|
||||
skill_meta = parse_skill_frontmatter(skill_md)
|
||||
inferred_personas = infer_personas_from_skill_metadata(
|
||||
skill_dir.name, skill_meta
|
||||
)
|
||||
configured_personas = skill_map.get(skill_dir.name, [])
|
||||
merged_personas = sorted(
|
||||
set(configured_personas).union(inferred_personas)
|
||||
)
|
||||
content = skill_md.read_text(encoding="utf-8")
|
||||
first_line = ""
|
||||
for line in content.split("\n"):
|
||||
line = line.strip()
|
||||
if line and not line.startswith(
|
||||
("---", "#", "name:", "description:")
|
||||
):
|
||||
first_line = line[:120]
|
||||
break
|
||||
index["paperclip_skills"][skill_dir.name] = {
|
||||
"personas": merged_personas,
|
||||
"summary": first_line,
|
||||
"domain": str(skill_meta.get("domain", "")),
|
||||
"subdomain": str(skill_meta.get("subdomain", "")),
|
||||
"tags": skill_meta.get("tags", []),
|
||||
"mapped_by": {
|
||||
"explicit": configured_personas,
|
||||
"inferred": inferred_personas,
|
||||
},
|
||||
"has_references": (skill_dir / "references").is_dir(),
|
||||
}
|
||||
|
||||
# Index community-skills
|
||||
cskills_dir = shared_dir / "community-skills"
|
||||
@@ -728,7 +774,35 @@ def build_skills_index(shared_dir: Path, config: dict = None) -> dict:
|
||||
continue
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
index["community_skills"][skill_dir.name] = True
|
||||
skill_meta = parse_skill_frontmatter(skill_md)
|
||||
inferred_personas = infer_personas_from_skill_metadata(
|
||||
skill_dir.name, skill_meta
|
||||
)
|
||||
configured_personas = skill_map.get(skill_dir.name, [])
|
||||
merged_personas = sorted(
|
||||
set(configured_personas).union(inferred_personas)
|
||||
)
|
||||
content = skill_md.read_text(encoding="utf-8")
|
||||
first_line = ""
|
||||
for line in content.split("\n"):
|
||||
line = line.strip()
|
||||
if line and not line.startswith(
|
||||
("---", "#", "name:", "description:")
|
||||
):
|
||||
first_line = line[:120]
|
||||
break
|
||||
index["community_skills"][skill_dir.name] = {
|
||||
"personas": merged_personas,
|
||||
"summary": first_line,
|
||||
"domain": str(skill_meta.get("domain", "")),
|
||||
"subdomain": str(skill_meta.get("subdomain", "")),
|
||||
"tags": skill_meta.get("tags", []),
|
||||
"mapped_by": {
|
||||
"explicit": configured_personas,
|
||||
"inferred": inferred_personas,
|
||||
},
|
||||
"has_references": (skill_dir / "references").is_dir(),
|
||||
}
|
||||
|
||||
# Index feynman-skills (research workflows adapted from Feynman).
|
||||
# Use the same persona-aware indexing as shared skills so mapped skills
|
||||
|
||||
121
personas/_shared/paperclip-skills/browser-use/SKILL.md
Normal file
121
personas/_shared/paperclip-skills/browser-use/SKILL.md
Normal file
@@ -0,0 +1,121 @@
|
||||
---
|
||||
name: browser-use
|
||||
description: Automates browser interactions for web testing, form filling, screenshots, and data extraction. Use when the user needs to navigate websites, interact with web pages, fill forms, take screenshots, or extract information from web pages.
|
||||
license: MIT
|
||||
metadata:
|
||||
author: browser-use
|
||||
version: "1.1.0"
|
||||
domain: engineering
|
||||
subdomain: browser-automation
|
||||
triggers: browser-use, browser automation, web scraping, form filling, screenshot, cloud browser, playwright cdp, session replay, workspace files, profile sync
|
||||
role: engineer
|
||||
scope: implementation
|
||||
---
|
||||
|
||||
# Browser Use
|
||||
|
||||
Use Browser Use Cloud SDK and API to run browser agents and raw browser sessions.
|
||||
|
||||
## When To Use
|
||||
|
||||
- Navigate websites and extract structured data
|
||||
- Fill forms and execute multi-step workflows
|
||||
- Stream live browser actions and agent messages
|
||||
- Reuse sessions, profiles, and workspaces across tasks
|
||||
- Connect Playwright/Puppeteer via CDP to cloud browsers
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
pip install browser-use-sdk
|
||||
export BROWSER_USE_API_KEY=your_key
|
||||
```
|
||||
|
||||
TypeScript:
|
||||
|
||||
```bash
|
||||
npm install browser-use-sdk
|
||||
```
|
||||
|
||||
## Quick Start (v3 SDK)
|
||||
|
||||
```python
|
||||
from browser_use_sdk.v3 import AsyncBrowserUse
|
||||
|
||||
client = AsyncBrowserUse()
|
||||
result = await client.run("List the top 20 posts on Hacker News today with their points")
|
||||
print(result.output)
|
||||
```
|
||||
|
||||
```typescript
|
||||
import { BrowserUse } from "browser-use-sdk/v3";
|
||||
|
||||
const client = new BrowserUse();
|
||||
const result = await client.run("List the top 20 posts on Hacker News today with their points");
|
||||
console.log(result.output);
|
||||
```
|
||||
|
||||
## Core Patterns
|
||||
|
||||
- `run()` for one-shot tasks: auto create + poll + return output.
|
||||
- `sessions.create()` + `session_id` for follow-up tasks with shared browser state.
|
||||
- `workspaces.*` for file upload/download workflows.
|
||||
- `profiles.*` for login persistence and recurring automation.
|
||||
- `browsers.create()` for raw CDP control (Playwright/Puppeteer).
|
||||
|
||||
### Follow-up task pattern
|
||||
|
||||
```python
|
||||
session = await client.sessions.create()
|
||||
await client.run("Go to amazon.com and open first laptop", session_id=session.id)
|
||||
await client.run("Extract customer reviews", session_id=session.id)
|
||||
await client.sessions.stop(session.id)
|
||||
```
|
||||
|
||||
### Structured output
|
||||
|
||||
- Python: pass `output_schema` (Pydantic).
|
||||
- TypeScript: pass `schema` (Zod v4 required).
|
||||
|
||||
### Stream messages
|
||||
|
||||
- Iterate over `client.run(...)` to receive live messages.
|
||||
- `run.result` is valid only after iteration completes.
|
||||
|
||||
### Deterministic rerun (cache-script)
|
||||
|
||||
- Use `@{{...}}` placeholders in task plus `workspace_id`.
|
||||
- First run builds script, next runs can execute without LLM.
|
||||
- `cache_script`: `None` (auto), `True` (force), `False` (disable).
|
||||
|
||||
## Agent vs Browser
|
||||
|
||||
- Agent mode: `client.run(...)`, `client.sessions.*`.
|
||||
- Browser mode: `client.browsers.create(...)` returns `cdp_url` + `live_url`.
|
||||
- Use browser mode when you need custom CDP automation with Playwright/Puppeteer.
|
||||
|
||||
## Authentication and Persistence
|
||||
|
||||
- API key env: `BROWSER_USE_API_KEY`.
|
||||
- Header for direct API calls: `X-Browser-Use-API-Key: <key>`.
|
||||
- For user-specific state: create one profile per user and reuse `profile_id`.
|
||||
|
||||
## Operations Checklist
|
||||
|
||||
- Always stop sessions/browsers when done to avoid idle charges.
|
||||
- Always stop profiled sessions to persist cookies/localStorage correctly.
|
||||
- Sessions idle-timeout after 15 minutes; max duration is 4 hours.
|
||||
- Recording links are presigned and expire quickly (about 1 hour).
|
||||
|
||||
## Common Gotchas
|
||||
|
||||
- If streaming loop is interrupted early, cancel with `sessions.stop(..., strategy="task")` before sending another task.
|
||||
- TypeScript structured output fails with Zod v3; use Zod v4.
|
||||
- Selenium remote CDP support is limited; prefer Playwright/Puppeteer for cloud CDP.
|
||||
- Deleting a workspace is permanent.
|
||||
|
||||
## Reference
|
||||
|
||||
- Full LLM-optimized docs: `https://docs.browser-use.com/llms-full.txt`
|
||||
- Quick index: `https://docs.browser-use.com/llms.txt`
|
||||
- API key: `https://cloud.browser-use.com/settings?tab=api-keys&new=1`
|
||||
60
personas/_shared/skills/ekos-gazete-search/README.md
Normal file
60
personas/_shared/skills/ekos-gazete-search/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# ekos-gazete-search
|
||||
|
||||
Claude Code skill: İstanbul Üniversitesi EKOS gazete arşivinde (1928-1942, 53 gazete, 581 106 OCR'lı sayfa) konu-bazlı sistematik arama.
|
||||
|
||||
## Hızlı başlangıç
|
||||
|
||||
```bash
|
||||
cd ~/.claude/skills/ekos-gazete-search
|
||||
python3 -m venv .venv && source .venv/bin/activate
|
||||
pip install -r scripts/requirements.txt
|
||||
|
||||
# 1) Manifest oluştur (~1 dk, tek seferlik)
|
||||
python scripts/01_build_manifest.py
|
||||
|
||||
# 2) Kırım taramasını öncelikli pencerelerle başlat
|
||||
python scripts/02_search_pdfs.py \
|
||||
--keywords keywords/kirim.yaml \
|
||||
--priority-only \
|
||||
--workers 4
|
||||
|
||||
# 3) Obsidian raporu oluştur
|
||||
python scripts/03_render_report.py --topic Kirim
|
||||
```
|
||||
|
||||
## Yapı
|
||||
|
||||
```
|
||||
.
|
||||
├── SKILL.md # Claude'a yönerge
|
||||
├── README.md # bu dosya
|
||||
├── keywords/
|
||||
│ ├── _template.yaml # yeni konu için şablon
|
||||
│ └── kirim.yaml # Kırım (Hanlık, Tatar, diaspora, Sovyet)
|
||||
├── scripts/
|
||||
│ ├── 01_build_manifest.py # 53 gazete sayfasını çek → manifest CSV
|
||||
│ ├── 02_search_pdfs.py # PDF indir + pdftotext + fuzzy regex → JSONL
|
||||
│ ├── 03_render_report.py # JSONL → Obsidian markdown
|
||||
│ ├── lib/fuzzy.py # OCR-toleranslı Türkçe regex motoru
|
||||
│ └── requirements.txt
|
||||
├── manifests/ # üretilmiş CSV'ler
|
||||
└── hits/ # üretilmiş JSONL hit dosyaları
|
||||
```
|
||||
|
||||
## Yeni konu
|
||||
|
||||
```bash
|
||||
cp keywords/_template.yaml keywords/filistin.yaml
|
||||
# Düzenle: canonical, aliases, proper_nouns, disambiguators, priority_windows
|
||||
python scripts/02_search_pdfs.py --keywords keywords/filistin.yaml --out hits/filistin.jsonl
|
||||
python scripts/03_render_report.py --hits hits/filistin.jsonl --topic Filistin
|
||||
```
|
||||
|
||||
## Sınırlar
|
||||
|
||||
- **Bant genişliği:** 581k sayfa × ~14MB PDF ≈ 8+ TB. Skill her PDF'i indirir, text-layer çıkarır, hit yoksa siler. Tam mirror YAPMAZ.
|
||||
- **Throttle:** varsayılan 0.25 sn/istek + 4 worker = ~3 sayfa/sn. Kütüphaneye nezaket.
|
||||
- **OCR:** 2014 vintage, Türkçe diakritikleri çöp. Fuzzy regex bunu telafi eder ama %100 değildir.
|
||||
- **Kapsam:** 1928–1942. **Kırım sürgünü (1944) bu arşivde YOK.**
|
||||
|
||||
Ayrıntı için: `SKILL.md` ve [vault haritalama notu](/home/salva/Obsidian/6-Geopolitics/Russia/03.%20HISTORICAL%20CONTEXT/EKOS-Gazete-Arsivi-Haritalama.md).
|
||||
169
personas/_shared/skills/ekos-gazete-search/SKILL.md
Normal file
169
personas/_shared/skills/ekos-gazete-search/SKILL.md
Normal file
@@ -0,0 +1,169 @@
|
||||
---
|
||||
name: ekos-gazete-search
|
||||
description: "İstanbul Üniversitesi EKOS gazete arşivinde (1928-1942, 53 gazete, 581k sayfa OCR'lı) konu-bazlı sistematik arama. Türkçe-OCR-toleranslı fuzzy regex, öncelikli zaman pencereleri, Obsidian raporu üretimi. Kırım, Filistin, Holodomor, herhangi bir konu için parametrik."
|
||||
domain: intelligence
|
||||
subdomain: archival-research
|
||||
tags:
|
||||
- archive
|
||||
- foia
|
||||
- ottoman-press
|
||||
- turkish-press
|
||||
- historical-research
|
||||
- ocr
|
||||
- pdf
|
||||
- newspaper
|
||||
- early-republic
|
||||
- crimea
|
||||
- kirim
|
||||
- diaspora
|
||||
personas:
|
||||
- scribe
|
||||
- scholar
|
||||
- oracle
|
||||
- frodo
|
||||
---
|
||||
|
||||
# EKOS Gazete Arama — Skill
|
||||
|
||||
## Ne zaman çağırılır?
|
||||
|
||||
Kullanıcı şunlardan birini söylediğinde:
|
||||
- "EKOS arşivinde X tara/ara"
|
||||
- "İstanbul Üniversitesi gazete arşivinde X haberlerini bul"
|
||||
- "1928-1942 Türk basınında X"
|
||||
- "nek.istanbul.edu.tr gazetelerinde tarama"
|
||||
- Var olan keyword set'i (Kırım, Filistin, vb.) ile yeniden çalıştır
|
||||
|
||||
## Mimari özet
|
||||
|
||||
```
|
||||
SLUG (53 gazete) → manifest.csv → fuzzy search → hits.jsonl → Obsidian raporu
|
||||
```
|
||||
|
||||
3 aşama, üç ayrı script:
|
||||
1. **`scripts/01_build_manifest.py`** — 53 gazete sayfasını çekip tüm PDF URL'lerini `manifests/ekos_master.csv`'ye yazar. Bir kez çalıştırılır, cache'lenir.
|
||||
2. **`scripts/02_search_pdfs.py`** — manifest üzerinden iterate; her PDF'i indir, `pdftotext` ile metni çıkar, fuzzy regex'le ara, hit'leri `hits/<topic>.jsonl`'ye yaz, PDF'i sil.
|
||||
3. **`scripts/03_render_report.py`** — JSONL'yi `6-Geopolitics/Russia/03. HISTORICAL CONTEXT/` altına master + yıllık raporlar olarak markdown'a render eder.
|
||||
|
||||
## Önkoşullar
|
||||
|
||||
```bash
|
||||
# Sistem paketleri (Kali Linux'ta zaten var olabilir)
|
||||
which pdftotext pdfinfo curl # poppler-utils
|
||||
|
||||
# Python venv (CLAUDE.md kuralı: sisteme değil venv'e kur)
|
||||
cd /home/salva/.claude/skills/ekos-gazete-search
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install requests pyyaml beautifulsoup4
|
||||
```
|
||||
|
||||
## Tipik kullanım akışı
|
||||
|
||||
### A) İlk çalıştırma — manifest oluştur
|
||||
|
||||
```bash
|
||||
cd /home/salva/.claude/skills/ekos-gazete-search
|
||||
source .venv/bin/activate
|
||||
python scripts/01_build_manifest.py
|
||||
# → manifests/ekos_master.csv (tek seferlik, ~5 dk)
|
||||
```
|
||||
|
||||
### B) Arama — Kırım için, öncelikli pencerelerden başlayarak
|
||||
|
||||
```bash
|
||||
# Strateji B: 1932-33, 1936-37, 1941-42 önce
|
||||
python scripts/02_search_pdfs.py \
|
||||
--keywords keywords/kirim.yaml \
|
||||
--priority-only \
|
||||
--workers 4 \
|
||||
--out hits/kirim.jsonl
|
||||
|
||||
# Sonra geri kalan tüm yıllar
|
||||
python scripts/02_search_pdfs.py \
|
||||
--keywords keywords/kirim.yaml \
|
||||
--workers 4 \
|
||||
--out hits/kirim.jsonl
|
||||
```
|
||||
|
||||
### C) POC modu — tek gazete (aşağıda: cumhuriyet), az veri ile test
|
||||
|
||||
```bash
|
||||
python scripts/02_search_pdfs.py \
|
||||
--keywords keywords/kirim.yaml \
|
||||
--slug cumhuriyet \
|
||||
--year-from 1932 --year-to 1933 \
|
||||
--limit 50 \
|
||||
--out hits/kirim_poc.jsonl
|
||||
```
|
||||
|
||||
### D) Raporu render et
|
||||
|
||||
```bash
|
||||
python scripts/03_render_report.py \
|
||||
--hits hits/kirim.jsonl \
|
||||
--topic Kirim \
|
||||
--keywords keywords/kirim.yaml
|
||||
# → 6-Geopolitics/Russia/03. HISTORICAL CONTEXT/EKOS-Kirim-Bulgular.md (master)
|
||||
# → EKOS-Kirim-1932.md, EKOS-Kirim-1933.md, ... (yıllık)
|
||||
```
|
||||
|
||||
## Yeni konu eklemek
|
||||
|
||||
1. `keywords/<topic>.yaml` oluştur — `keywords/_template.yaml`'ı şablon olarak kullan.
|
||||
2. Wordlist'i doldur: `canonical`, `aliases`, `proper_nouns` (kişi adları), `disambiguators` (false positive filtreleri).
|
||||
3. `priority_windows` tanımla — konunun yoğunlaştığı yıllar.
|
||||
4. Çalıştır: `python scripts/02_search_pdfs.py --keywords keywords/<topic>.yaml --out hits/<topic>.jsonl`
|
||||
|
||||
## OCR Toleransı
|
||||
|
||||
PDF'lerin OCR'ı 2014 vintage, kalitesi orta-düşük. Türkçe diakritikleri sistematik olarak bozulmuş:
|
||||
|
||||
| Doğru | OCR'da | Regex class |
|
||||
|---|---|---|
|
||||
| `ı` | `1, i, l, \|` | `[1iIıİlj\|]` |
|
||||
| `ş` | `~, s` | `[s~ş]` |
|
||||
| `ç` | `c` | `[cç]` |
|
||||
| `ğ` | `g` | `[gğ]` |
|
||||
| `ü` | `u, ii` | `(?:[uü]\|ii)` |
|
||||
| `ö` | `o` | `[oö]` |
|
||||
|
||||
`scripts/lib/fuzzy.py` bu mapping'i otomatik uygular: `build_pattern("Kırım")` → `r"K[1iIıİlj|][rR][1iIıİlj|]m"`.
|
||||
|
||||
## Sınırlar ve uyarılar
|
||||
|
||||
- **Yunanca/Ermenice gazeteler** (apoyevmatini, aravelk, jamanak, metapolitefsis): OCR'ları henüz test edilmedi. İlk taramada Latin transkripsiyon aliases üzerinden tarayacak. Yetersizse ileride Tesseract `ell`/`hye` ile re-OCR eklenir.
|
||||
- **Throttle:** 0.25 sn/istek. 581k sayfa tüm arşiv için 4 worker × ~12-18 saat. Kütüphaneye nezaket.
|
||||
- **False positive:** "Kerim" (özel ad) ↔ "Kırım", "Kefe" (ilçe) ↔ "kefil/kefe" çakışması olur. Hit listesini gözden geçirirken `disambiguators` listesini büyüt.
|
||||
- **Telif:** 1928-1942 PDF'ler kütüphane tarafından dağıtılıyor; biz sadece arama yapıp URL referansı kaydediyoruz, kalıcı kopya almıyoruz. Yasal sorun yok.
|
||||
|
||||
## Çıktı şeması (`hits/*.jsonl`)
|
||||
|
||||
Her satır bir hit:
|
||||
```json
|
||||
{
|
||||
"slug": "cumhuriyet",
|
||||
"year": "1933",
|
||||
"month": "subat",
|
||||
"day": "12",
|
||||
"page": 3,
|
||||
"keyword": "Kırım",
|
||||
"match": "K1r1m",
|
||||
"snippet": "...lan acl1k haberlerine gore K1r1m'da binlerce..." ,
|
||||
"url": "https://nek.istanbul.edu.tr/.../cumhuriyet_1933_subat_12_.pdf"
|
||||
}
|
||||
```
|
||||
|
||||
## Persona ile entegrasyon
|
||||
|
||||
Bu skill, `persona-scribe-salva` (FOIA arşivci) personasının el aletidir. Scribe persona, arşiv-tarama görevi aldığında bu skill'i çağırır. Diğer alakalı personalar:
|
||||
- `persona-frodo-russia` — Sovyet/Rus dönem analizi için hit'leri yorumlar
|
||||
- `persona-centurion` — Askeri/savaş haberleri (1941-42 Doğu Cephesi)
|
||||
- `persona-polyglot-russian` — Yunanca/Ermenice gazeteler aktive olduğunda
|
||||
|
||||
## Bilinen geliştirme alanları
|
||||
|
||||
- [ ] Yunanca/Ermenice OCR re-pass (Tesseract 5)
|
||||
- [ ] Hit-level Tesseract doğrulaması (yanlış pozitif azaltma)
|
||||
- [ ] Dataview view'ı (Obsidian'da hit listesi sortlanabilir)
|
||||
- [ ] Kütüphaneye yazılı bilgi notu (büyük tarama öncesi)
|
||||
@@ -0,0 +1,38 @@
|
||||
# EKOS Gazete Arama — Keyword Set Template
|
||||
# Yeni konu eklerken bu dosyayı kopyala: cp _template.yaml <topic>.yaml
|
||||
#
|
||||
# Şema açıklaması:
|
||||
# - canonical: aramada görüntülenecek "doğru" yazım (raporda bu görünür)
|
||||
# - aliases: aynı kavramın diğer yazımları (transliterasyon, eski Türkçe, yabancı dil)
|
||||
# - suffixes: opsiyonel — Türkçe ek toleransı (Kırım+lı, Kırım+da, ...)
|
||||
# - weight: hit önemi (1=zayıf sinyal, 5=smoking gun). Rapor sıralaması bunu kullanır.
|
||||
# - notes: bağlam (raporda görünmez)
|
||||
|
||||
topic: example
|
||||
description: "Konu kısa açıklaması — raporun başlığında görünür"
|
||||
|
||||
# 1. Ana terimler — geniş, kavram seviyesi
|
||||
keywords:
|
||||
- canonical: "Örnek"
|
||||
aliases: ["Example", "Beispiel"]
|
||||
weight: 3
|
||||
notes: "Genel terim"
|
||||
|
||||
# 2. Özel isimler — kişiler, yerler (smoking gun)
|
||||
proper_nouns:
|
||||
- canonical: "Mustafa Kemal"
|
||||
aliases: ["Gazi", "Atatürk"]
|
||||
weight: 5
|
||||
|
||||
# 3. Disambiguator — false positive filtre
|
||||
# Eğer match'in ±20 karakter çevresinde bu terimler varsa hit reddedilir
|
||||
disambiguators:
|
||||
- "Kerim Bey" # "Kırım" ile karışan özel ad
|
||||
- "Kerime Hanım"
|
||||
|
||||
# 4. Öncelikli zaman pencereleri — bu yıllarda hit'ler önce taranır + raporda öne çıkar
|
||||
priority_windows:
|
||||
- start: "1932-01-01"
|
||||
end: "1933-12-31"
|
||||
reason: "Açıklama"
|
||||
weight: 5
|
||||
286
personas/_shared/skills/ekos-gazete-search/keywords/kirim.yaml
Normal file
286
personas/_shared/skills/ekos-gazete-search/keywords/kirim.yaml
Normal file
@@ -0,0 +1,286 @@
|
||||
# EKOS Gazete Arama — Kırım Keyword Set
|
||||
# Kapsam: Kırım Hanlığı dönemi mirasından 1942'ye kadar Türk basınında
|
||||
# Kırım coğrafyası, halkı, diasporası, Sovyet dönemi, ve siyasi figürleri.
|
||||
|
||||
topic: Kirim
|
||||
description: "Kırım — Hanlık mirası, Tatar halkı, diaspora, Sovyet dönemi (1928-1942)"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 1. ANA TERİMLER — coğrafya ve kavram (geniş, weight 3-4)
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
keywords:
|
||||
# Coğrafi temel
|
||||
- canonical: "Kırım"
|
||||
aliases: ["Crimea", "Krim", "Krym", "Krymea", "Crimee", "La Crimee"]
|
||||
suffixes: ["lı", "lılar", "dan", "ya", "a", "da", "de", "i", "ın", "ı"]
|
||||
weight: 4
|
||||
notes: "Ana terim. OCR'da K1r1m, Kirim varyantları hakim."
|
||||
|
||||
- canonical: "Kırım Hanlığı"
|
||||
aliases: ["Khanate of Crimea", "Crimean Khanate"]
|
||||
weight: 5
|
||||
notes: "Tarihsel devlet (1441-1783). Hanlık nostaljisi 1930'larda diaspora söyleminde aktif."
|
||||
|
||||
- canonical: "Kırım Yarımadası"
|
||||
aliases: ["Crimean Peninsula", "Tauride"]
|
||||
weight: 4
|
||||
|
||||
- canonical: "Kırım Türkleri"
|
||||
aliases: ["Kırım Tatarları", "Crimean Tatars", "Krimtataren", "Tatars de Crimee"]
|
||||
weight: 5
|
||||
notes: "Diaspora söyleminde 'Türk' kelimesi 'Tatar' yerine sık kullanıldı"
|
||||
|
||||
- canonical: "Tatar"
|
||||
aliases: ["Tatarlar", "Tatarların", "Tatare", "Tartar"]
|
||||
weight: 2
|
||||
notes: "WEIGHT DÜŞÜK — çok geniş hit verecek (Kazan Tatarı, Sibirya Tatarı vs). Disambiguator gerekir."
|
||||
|
||||
# Kırım şehirleri ve coğrafi noktalar
|
||||
- canonical: "Bahçesaray"
|
||||
aliases: ["Bahçe-saray", "Bagcesaray", "Bachtschisaraj", "Bakhchisaray", "Bakhchysaray"]
|
||||
weight: 5
|
||||
notes: "Hanlık başkenti — geçtiyse %100 Kırım bağlamı"
|
||||
|
||||
- canonical: "Akmescit"
|
||||
aliases: ["Ak-mescit", "Akmesçit", "Simferopol", "Симферополь", "Simferopole"]
|
||||
weight: 5
|
||||
|
||||
- canonical: "Kefe"
|
||||
aliases: ["Caffa", "Theodosia", "Feodosiya", "Feodosia", "Theodosie"]
|
||||
weight: 5
|
||||
notes: "Eski Ceneviz/Osmanlı liman şehri. 'kefil/kefa' ile çakışmaya dikkat — disambiguator zorunlu."
|
||||
|
||||
- canonical: "Gözleve"
|
||||
aliases: ["Yevpatoriya", "Eupatoria", "Yevpatoria"]
|
||||
weight: 5
|
||||
|
||||
- canonical: "Sivastopol"
|
||||
aliases: ["Sebastopol", "Sevastopol", "Sevastopolj", "Sevastopole"]
|
||||
weight: 4
|
||||
notes: "1854-55 Kırım Savaşı'nda meşhur, 1942'de Alman kuşatması"
|
||||
|
||||
- canonical: "Kerç"
|
||||
aliases: ["Kertsch", "Kerch", "Керчь", "Kerč"]
|
||||
weight: 4
|
||||
notes: "1941-42 Doğu Cephesi'nde stratejik"
|
||||
|
||||
- canonical: "Yalta"
|
||||
aliases: ["Jalta", "Ялта"]
|
||||
weight: 4
|
||||
|
||||
- canonical: "Çatırdağ"
|
||||
aliases: ["Çatır Dağı", "Chatyr-Dag", "Tschatyr-Dag"]
|
||||
weight: 5
|
||||
notes: "Kırım Tatar şiir/hatıra geleneğinde sembol — diaspora yazılarının imzası"
|
||||
|
||||
- canonical: "Or Kapı"
|
||||
aliases: ["Orkapı", "Perekop", "Перекоп"]
|
||||
weight: 5
|
||||
notes: "Kırım'a giriş kapısı; askeri haberlerin merkezi (1920 İç Savaş, 1941-42)"
|
||||
|
||||
- canonical: "Karasubazar"
|
||||
aliases: ["Karasu Bazar", "Karasubazaar", "Belogorsk"]
|
||||
weight: 5
|
||||
|
||||
- canonical: "Kezlev"
|
||||
aliases: ["Yevpatoria", "Kozlov"]
|
||||
weight: 4
|
||||
|
||||
# Tarihsel / siyasi kavramlar
|
||||
- canonical: "Kırım Muhtar Cumhuriyeti"
|
||||
aliases: ["Crimean ASSR", "Krimskaja ASSR", "Кримська АРСР", "Crimean Autonomous"]
|
||||
weight: 5
|
||||
notes: "1921'de kurulan Sovyet özerk cumhuriyeti — 1928-1942 arası tüm Kırım haberinin idari bağlamı"
|
||||
|
||||
- canonical: "Milli Fırka"
|
||||
aliases: ["Millî Fırka", "Milli Firka", "Kırım Milli Fırkası"]
|
||||
weight: 5
|
||||
notes: "Numan Çelebi Cihan'ın partisi — diaspora yazılarında smoking gun"
|
||||
|
||||
- canonical: "Kurultay"
|
||||
aliases: ["Kırım Kurultayı"]
|
||||
weight: 4
|
||||
notes: "1917 Kurultay'ı, Kazan Kurultay'ı ile karışabilir — bağlam denetimi gerekli"
|
||||
|
||||
- canonical: "muhacir"
|
||||
aliases: ["muhacirin", "muhacirler", "mültecilik", "mülteci"]
|
||||
weight: 2
|
||||
notes: "Genel terim ama Kırım göçü konusunda yoğun. Düşük weight + bağlam."
|
||||
|
||||
# Kırım Savaşı (tarihsel referans olarak gazetelerde geçer)
|
||||
- canonical: "Kırım Savaşı"
|
||||
aliases: ["Kırım Harbi", "Crimean War", "Krimkrieg", "Guerre de Crimee"]
|
||||
weight: 4
|
||||
notes: "1853-56. Tarihsel makaleler 1928-1942 boyunca düzenli."
|
||||
|
||||
# Sovyet dönem terminoloji
|
||||
- canonical: "kollektivizasyon"
|
||||
aliases: ["kollektifleştirme", "kolhoz", "sovhoz", "kolxoz"]
|
||||
weight: 2
|
||||
notes: "Geniş Sovyet bağlamı; Kırım haberleriyle birlikte gelirse weight artar"
|
||||
|
||||
- canonical: "açlık"
|
||||
aliases: ["kıtlık", "ac11k", "kit11k"]
|
||||
weight: 1
|
||||
notes: "Çok geniş — sadece Kırım/Sovyet ile yakınsa anlamlı"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 2. ÖZEL İSİMLER — kişiler (smoking gun, weight 5)
|
||||
# Bir gazete sayfasında bu isimlerden biri geçtiyse Kırım içeriği %95 garanti.
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
proper_nouns:
|
||||
# Kırım Tatar siyasi liderleri
|
||||
- canonical: "Numan Çelebi Cihan"
|
||||
aliases: ["Noman Çelebicihan", "Numan Çelebicihan", "Çelebi Cihan", "Celebi Cihan"]
|
||||
weight: 5
|
||||
notes: "Kırım Müslüman Demokratik Cumhuriyeti kurucusu (1917), Bolşeviklerce öldürüldü 1918"
|
||||
|
||||
- canonical: "Cafer Seydahmet"
|
||||
aliases: ["Cafer Seyit Ahmet", "Cafer Seydamet", "Seydahmet Kırımer", "Cafer Kırımer", "Cafer Seyid Ahmet"]
|
||||
weight: 5
|
||||
notes: "İstanbul'da Kırım diasporasının lideri; 1928-1942 arası aktif yazar"
|
||||
|
||||
- canonical: "Müstecip Ülküsal"
|
||||
aliases: ["Mustecip Ulkusal", "Müstecip Hacı Fazıl", "Ülküsal"]
|
||||
weight: 5
|
||||
notes: "Romanya/Köstence merkezli Kırım Tatar lideri, 'Emel' dergisi"
|
||||
|
||||
- canonical: "Hamdullah Suphi"
|
||||
aliases: ["Hamdullah Suphi Tanrıöver", "Tanrıöver"]
|
||||
weight: 4
|
||||
notes: "Türk Ocakları reisi, Kırım/Romanya muhaceretiyle ilgili devlet adamı"
|
||||
|
||||
- canonical: "Yusuf Akçura"
|
||||
aliases: ["Yusuf Akçuraoğlu", "Akçura", "Akcura"]
|
||||
weight: 4
|
||||
notes: "Kazan Tatarı ama Türkçü/Tatar dünyasının ortak figürü"
|
||||
|
||||
- canonical: "İsmail Gaspıralı"
|
||||
aliases: ["İsmail Bey Gaspıralı", "Gasprinski", "Gasprinsky", "Ismail Gaspirali"]
|
||||
weight: 5
|
||||
notes: "Tercüman gazetesi yayıncısı, Türkçülüğün babası — anma yazıları sık"
|
||||
|
||||
- canonical: "Veli İbrahim"
|
||||
aliases: ["Veli Ibraimov", "Veli Ibrahim"]
|
||||
weight: 5
|
||||
notes: "Kırım Muhtar Cumhuriyeti başkanı, 1928'de Stalin tarafından idam"
|
||||
|
||||
- canonical: "Bekir Çobanzade"
|
||||
aliases: ["Bekir Çoban-zade", "Çobanzade", "Cobanzade"]
|
||||
weight: 5
|
||||
notes: "Kırım Tatar dilbilimci, 1937'de tasfiye edildi"
|
||||
|
||||
- canonical: "Mehmet Niyazi"
|
||||
aliases: ["Memet Niyazi", "Mehmed Niyazi"]
|
||||
weight: 4
|
||||
notes: "Romanya/Köstence Kırım Tatar şairi"
|
||||
|
||||
- canonical: "Habibullah Kerimi"
|
||||
aliases: ["Habibullah Karimi", "Kerimi"]
|
||||
weight: 4
|
||||
|
||||
- canonical: "Asan Sabri Ayvaz"
|
||||
aliases: ["Asan Sabri Ayvazov", "Ayvazov", "Sabri Ayvazov"]
|
||||
weight: 5
|
||||
notes: "Kırım Tatar yazar, 1937'de Stalin terörü kurbanı"
|
||||
|
||||
- canonical: "Reşit Mediyev"
|
||||
aliases: ["Reşit Mediev", "Mediyev", "Medief"]
|
||||
weight: 5
|
||||
|
||||
# Sovyet/Rus tarafı (Kırım'la doğrudan iş tutmuş)
|
||||
- canonical: "Stalin"
|
||||
aliases: ["Staline", "Сталин"]
|
||||
weight: 1
|
||||
notes: "Çok geniş; sadece Kırım/Tatar ile co-occurring olduğunda anlamlı"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 3. DİSAMBİGÜATÖRLER — false positive filtreleri
|
||||
# Bir match'in ±200 karakter çevresinde bu kelime varsa hit reddedilir
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
disambiguators:
|
||||
# "Kırım" ↔ "Kerim" (özel isim) çakışması
|
||||
- "Kerim Bey"
|
||||
- "Kerim Pa~a" # Kerim Paşa OCR
|
||||
- "Kerim Pasa"
|
||||
- "Kerime Hanim"
|
||||
- "Kerime Han1m"
|
||||
- "Kerim Efendi"
|
||||
- "Abdulkerim"
|
||||
- "Abdiilkerim" # OCR varyantı
|
||||
# "Kefe" (Crimea) ↔ "kefil/kefe" (sigorta/teminat)
|
||||
- "kefil"
|
||||
- "kefalet"
|
||||
- "kefaleten"
|
||||
# "Tatar" yiyecekler
|
||||
- "tatar boregi"
|
||||
- "tatar boregi"
|
||||
- "tatar pidesi"
|
||||
- "tatar sosu"
|
||||
# "Yalta" ↔ Türkçe "yalta" yok; "yaltak" var
|
||||
- "yaltakl"
|
||||
- "yaltaklan"
|
||||
# 1932 Türk Dili Kurultayı / Türk Tarih Kurultayı false positive'leri (POC iter-1'de öğrendik)
|
||||
# "Kurultay" tek başına Kırım için yetersiz; bu kombinler Atatürk dönemi reformları
|
||||
- "Türk Dili Kurultayı"
|
||||
- "Türk Dili Kurultay"
|
||||
- "Tiirk Dili Kurultayi" # OCR varyantı
|
||||
- "Dil Kurultayı"
|
||||
- "Dil Kurultay"
|
||||
- "Türk Tarih Kurultayı"
|
||||
- "Türk Tarih Kurultay"
|
||||
- "Tarih Kurultay"
|
||||
- "tarih kurultay"
|
||||
- "Halkevi Kurultay"
|
||||
- "halkevleri kurultay"
|
||||
- "C.H.F. Kurultay" # Cumhuriyet Halk Fırkası Kurultayı
|
||||
- "C.H.P. Kurultay"
|
||||
- "Fırka Kurultay"
|
||||
- "Parti Kurultay"
|
||||
# Kefe varyantları (genel "kıfayet/kifaye" OCR çöplüğü)
|
||||
- "kifayet"
|
||||
- "kifayetli"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 4. ÖNCELİKLİ ZAMAN PENCERELERİ
|
||||
# Bu pencerelerdeki sayılar önce taranır, raporda öne çıkar
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
priority_windows:
|
||||
- start: "1928-01-01"
|
||||
end: "1928-12-31"
|
||||
weight: 4
|
||||
reason: "Veli İbrahim idamı + Kırım Tatar tasfiyesinin başlangıcı"
|
||||
|
||||
- start: "1932-01-01"
|
||||
end: "1933-12-31"
|
||||
weight: 5
|
||||
reason: "Holodomor / Kırım açlığı — Sovyet kıtlığının zirvesi"
|
||||
|
||||
- start: "1936-01-01"
|
||||
end: "1938-06-30"
|
||||
weight: 5
|
||||
reason: "Stalin Büyük Terör — Çobanzade, Ayvazov, Bekirov tasfiyeleri"
|
||||
|
||||
- start: "1939-08-23"
|
||||
end: "1941-06-22"
|
||||
weight: 4
|
||||
reason: "Molotov-Ribbentrop dönemi; Sovyet politikasında diaspora söylemi"
|
||||
|
||||
- start: "1941-06-22"
|
||||
end: "1942-12-31"
|
||||
weight: 5
|
||||
reason: "Alman Doğu Cephesi ilerleyişi; Kırım'ın Wehrmacht tarafından işgali"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 5. CO-OCCURRENCE BOOST — birlikte geçerse hit ağırlığı artar
|
||||
# (lib/fuzzy.py içinde proximity score için)
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
co_occurrence_boost:
|
||||
# Bu çiftler aynı paragrafta (±300 char) geçerse weight +2
|
||||
- ["Kırım", "Tatar"]
|
||||
- ["Kırım", "muhacir"]
|
||||
- ["Sovyet", "Kırım"]
|
||||
- ["Kırım", "açlık"]
|
||||
- ["Kırım", "kollektivizasyon"]
|
||||
- ["Tatar", "Bahçesaray"]
|
||||
- ["Stalin", "Kırım"]
|
||||
@@ -0,0 +1,297 @@
|
||||
topic: KirimCore
|
||||
description: Kırım — sadece toponym ve Kırım-prefix kavramlar (dar tarama)
|
||||
keywords:
|
||||
- canonical: Kırım
|
||||
aliases:
|
||||
- Crimea
|
||||
- Krim
|
||||
- Krym
|
||||
- Krymea
|
||||
- Crimee
|
||||
- La Crimee
|
||||
suffixes:
|
||||
- lı
|
||||
- lılar
|
||||
- dan
|
||||
- ya
|
||||
- a
|
||||
- da
|
||||
- de
|
||||
- i
|
||||
- ın
|
||||
- ı
|
||||
weight: 4
|
||||
notes: Ana terim. OCR'da K1r1m, Kirim varyantları hakim.
|
||||
- canonical: Kırım Hanlığı
|
||||
aliases:
|
||||
- Khanate of Crimea
|
||||
- Crimean Khanate
|
||||
weight: 5
|
||||
notes: Tarihsel devlet (1441-1783). Hanlık nostaljisi 1930'larda diaspora söyleminde aktif.
|
||||
- canonical: Kırım Yarımadası
|
||||
aliases:
|
||||
- Crimean Peninsula
|
||||
- Tauride
|
||||
weight: 4
|
||||
- canonical: Kırım Türkleri
|
||||
aliases:
|
||||
- Kırım Tatarları
|
||||
- Crimean Tatars
|
||||
- Krimtataren
|
||||
- Tatars de Crimee
|
||||
weight: 5
|
||||
notes: Diaspora söyleminde 'Türk' kelimesi 'Tatar' yerine sık kullanıldı
|
||||
- canonical: Bahçesaray
|
||||
aliases:
|
||||
- Bahçe-saray
|
||||
- Bagcesaray
|
||||
- Bachtschisaraj
|
||||
- Bakhchisaray
|
||||
- Bakhchysaray
|
||||
weight: 5
|
||||
notes: Hanlık başkenti — geçtiyse %100 Kırım bağlamı
|
||||
- canonical: Akmescit
|
||||
aliases:
|
||||
- Ak-mescit
|
||||
- Akmesçit
|
||||
- Simferopol
|
||||
- Симферополь
|
||||
- Simferopole
|
||||
weight: 5
|
||||
- canonical: Kefe
|
||||
aliases:
|
||||
- Caffa
|
||||
- Theodosia
|
||||
- Feodosiya
|
||||
- Feodosia
|
||||
- Theodosie
|
||||
weight: 5
|
||||
notes: Eski Ceneviz/Osmanlı liman şehri. 'kefil/kefa' ile çakışmaya dikkat — disambiguator zorunlu.
|
||||
- canonical: Gözleve
|
||||
aliases:
|
||||
- Yevpatoriya
|
||||
- Eupatoria
|
||||
- Yevpatoria
|
||||
weight: 5
|
||||
- canonical: Sivastopol
|
||||
aliases:
|
||||
- Sebastopol
|
||||
- Sevastopol
|
||||
- Sevastopolj
|
||||
- Sevastopole
|
||||
weight: 4
|
||||
notes: 1854-55 Kırım Savaşı'nda meşhur, 1942'de Alman kuşatması
|
||||
- canonical: Kerç
|
||||
aliases:
|
||||
- Kertsch
|
||||
- Kerch
|
||||
- Керчь
|
||||
- Kerč
|
||||
weight: 4
|
||||
notes: 1941-42 Doğu Cephesi'nde stratejik
|
||||
- canonical: Yalta
|
||||
aliases:
|
||||
- Jalta
|
||||
- Ялта
|
||||
weight: 4
|
||||
- canonical: Çatırdağ
|
||||
aliases:
|
||||
- Çatır Dağı
|
||||
- Chatyr-Dag
|
||||
- Tschatyr-Dag
|
||||
weight: 5
|
||||
notes: Kırım Tatar şiir/hatıra geleneğinde sembol — diaspora yazılarının imzası
|
||||
- canonical: Or Kapı
|
||||
aliases:
|
||||
- Orkapı
|
||||
- Perekop
|
||||
- Перекоп
|
||||
weight: 5
|
||||
notes: Kırım'a giriş kapısı; askeri haberlerin merkezi (1920 İç Savaş, 1941-42)
|
||||
- canonical: Karasubazar
|
||||
aliases:
|
||||
- Karasu Bazar
|
||||
- Karasubazaar
|
||||
- Belogorsk
|
||||
weight: 5
|
||||
- canonical: Kezlev
|
||||
aliases:
|
||||
- Yevpatoria
|
||||
- Kozlov
|
||||
weight: 4
|
||||
- canonical: Kırım Muhtar Cumhuriyeti
|
||||
aliases:
|
||||
- Crimean ASSR
|
||||
- Krimskaja ASSR
|
||||
- Кримська АРСР
|
||||
- Crimean Autonomous
|
||||
weight: 5
|
||||
notes: 1921'de kurulan Sovyet özerk cumhuriyeti — 1928-1942 arası tüm Kırım haberinin idari bağlamı
|
||||
- canonical: Kırım Savaşı
|
||||
aliases:
|
||||
- Kırım Harbi
|
||||
- Crimean War
|
||||
- Krimkrieg
|
||||
- Guerre de Crimee
|
||||
weight: 4
|
||||
notes: 1853-56. Tarihsel makaleler 1928-1942 boyunca düzenli.
|
||||
proper_nouns:
|
||||
- canonical: Numan Çelebi Cihan
|
||||
aliases:
|
||||
- Noman Çelebicihan
|
||||
- Numan Çelebicihan
|
||||
- Çelebi Cihan
|
||||
- Celebi Cihan
|
||||
weight: 5
|
||||
notes: Kırım Müslüman Demokratik Cumhuriyeti kurucusu (1917), Bolşeviklerce öldürüldü 1918
|
||||
- canonical: Cafer Seydahmet
|
||||
aliases:
|
||||
- Cafer Seyit Ahmet
|
||||
- Cafer Seydamet
|
||||
- Seydahmet Kırımer
|
||||
- Cafer Kırımer
|
||||
- Cafer Seyid Ahmet
|
||||
weight: 5
|
||||
notes: İstanbul'da Kırım diasporasının lideri; 1928-1942 arası aktif yazar
|
||||
- canonical: Müstecip Ülküsal
|
||||
aliases:
|
||||
- Mustecip Ulkusal
|
||||
- Müstecip Hacı Fazıl
|
||||
- Ülküsal
|
||||
weight: 5
|
||||
notes: Romanya/Köstence merkezli Kırım Tatar lideri, 'Emel' dergisi
|
||||
- canonical: Hamdullah Suphi
|
||||
aliases:
|
||||
- Hamdullah Suphi Tanrıöver
|
||||
- Tanrıöver
|
||||
weight: 4
|
||||
notes: Türk Ocakları reisi, Kırım/Romanya muhaceretiyle ilgili devlet adamı
|
||||
- canonical: Yusuf Akçura
|
||||
aliases:
|
||||
- Yusuf Akçuraoğlu
|
||||
- Akçura
|
||||
- Akcura
|
||||
weight: 4
|
||||
notes: Kazan Tatarı ama Türkçü/Tatar dünyasının ortak figürü
|
||||
- canonical: İsmail Gaspıralı
|
||||
aliases:
|
||||
- İsmail Bey Gaspıralı
|
||||
- Gasprinski
|
||||
- Gasprinsky
|
||||
- Ismail Gaspirali
|
||||
weight: 5
|
||||
notes: Tercüman gazetesi yayıncısı, Türkçülüğün babası — anma yazıları sık
|
||||
- canonical: Veli İbrahim
|
||||
aliases:
|
||||
- Veli Ibraimov
|
||||
- Veli Ibrahim
|
||||
weight: 5
|
||||
notes: Kırım Muhtar Cumhuriyeti başkanı, 1928'de Stalin tarafından idam
|
||||
- canonical: Bekir Çobanzade
|
||||
aliases:
|
||||
- Bekir Çoban-zade
|
||||
- Çobanzade
|
||||
- Cobanzade
|
||||
weight: 5
|
||||
notes: Kırım Tatar dilbilimci, 1937'de tasfiye edildi
|
||||
- canonical: Mehmet Niyazi
|
||||
aliases:
|
||||
- Memet Niyazi
|
||||
- Mehmed Niyazi
|
||||
weight: 4
|
||||
notes: Romanya/Köstence Kırım Tatar şairi
|
||||
- canonical: Habibullah Kerimi
|
||||
aliases:
|
||||
- Habibullah Karimi
|
||||
- Kerimi
|
||||
weight: 4
|
||||
- canonical: Asan Sabri Ayvaz
|
||||
aliases:
|
||||
- Asan Sabri Ayvazov
|
||||
- Ayvazov
|
||||
- Sabri Ayvazov
|
||||
weight: 5
|
||||
notes: Kırım Tatar yazar, 1937'de Stalin terörü kurbanı
|
||||
- canonical: Reşit Mediyev
|
||||
aliases:
|
||||
- Reşit Mediev
|
||||
- Mediyev
|
||||
- Medief
|
||||
weight: 5
|
||||
- canonical: Stalin
|
||||
aliases:
|
||||
- Staline
|
||||
- Сталин
|
||||
weight: 1
|
||||
notes: Çok geniş; sadece Kırım/Tatar ile co-occurring olduğunda anlamlı
|
||||
disambiguators:
|
||||
- Kerim Bey
|
||||
- Kerim Pa~a
|
||||
- Kerim Pasa
|
||||
- Kerime Hanim
|
||||
- Kerime Han1m
|
||||
- Kerim Efendi
|
||||
- Abdulkerim
|
||||
- Abdiilkerim
|
||||
- kefil
|
||||
- kefalet
|
||||
- kefaleten
|
||||
- tatar boregi
|
||||
- tatar boregi
|
||||
- tatar pidesi
|
||||
- tatar sosu
|
||||
- yaltakl
|
||||
- yaltaklan
|
||||
- Türk Dili Kurultayı
|
||||
- Türk Dili Kurultay
|
||||
- Tiirk Dili Kurultayi
|
||||
- Dil Kurultayı
|
||||
- Dil Kurultay
|
||||
- Türk Tarih Kurultayı
|
||||
- Türk Tarih Kurultay
|
||||
- Tarih Kurultay
|
||||
- tarih kurultay
|
||||
- Halkevi Kurultay
|
||||
- halkevleri kurultay
|
||||
- C.H.F. Kurultay
|
||||
- C.H.P. Kurultay
|
||||
- Fırka Kurultay
|
||||
- Parti Kurultay
|
||||
- kifayet
|
||||
- kifayetli
|
||||
priority_windows:
|
||||
- start: '1928-01-01'
|
||||
end: '1928-12-31'
|
||||
weight: 4
|
||||
reason: Veli İbrahim idamı + Kırım Tatar tasfiyesinin başlangıcı
|
||||
- start: '1932-01-01'
|
||||
end: '1933-12-31'
|
||||
weight: 5
|
||||
reason: Holodomor / Kırım açlığı — Sovyet kıtlığının zirvesi
|
||||
- start: '1936-01-01'
|
||||
end: '1938-06-30'
|
||||
weight: 5
|
||||
reason: Stalin Büyük Terör — Çobanzade, Ayvazov, Bekirov tasfiyeleri
|
||||
- start: '1939-08-23'
|
||||
end: '1941-06-22'
|
||||
weight: 4
|
||||
reason: Molotov-Ribbentrop dönemi; Sovyet politikasında diaspora söylemi
|
||||
- start: '1941-06-22'
|
||||
end: '1942-12-31'
|
||||
weight: 5
|
||||
reason: Alman Doğu Cephesi ilerleyişi; Kırım'ın Wehrmacht tarafından işgali
|
||||
co_occurrence_boost:
|
||||
- - Kırım
|
||||
- Tatar
|
||||
- - Kırım
|
||||
- muhacir
|
||||
- - Sovyet
|
||||
- Kırım
|
||||
- - Kırım
|
||||
- açlık
|
||||
- - Kırım
|
||||
- kollektivizasyon
|
||||
- - Tatar
|
||||
- Bahçesaray
|
||||
- - Stalin
|
||||
- Kırım
|
||||
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build a master manifest of every PDF in the EKOS gazette archive.
|
||||
|
||||
Fetches each gazete.php?gazete=<slug> page once, extracts all PDF
|
||||
hrefs, and writes them into manifests/ekos_master.csv. This is a
|
||||
one-time operation (~5 minutes); the resulting CSV drives subsequent
|
||||
search runs.
|
||||
"""
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import requests
|
||||
|
||||
BASE = "https://nek.istanbul.edu.tr/ekos/GAZETE/"
|
||||
HERE = Path(__file__).resolve().parent.parent
|
||||
|
||||
# 53 newspaper slugs discovered during recon (2026-04-28)
|
||||
SLUGS = [
|
||||
"aciksoz", "aksam", "anadolu", "apoyevmatini", "aravelk", "aydin",
|
||||
"beyoglu", "borsa", "bugun", "cerideihavadis", "cumhuriyet", "dogu",
|
||||
"ensondakika", "ensonhavadis", "haber", "hakikat", "hakimiyetimilliye",
|
||||
"hakkinsesi", "halkindili", "halkinsesi", "ikdam", "ikdamhalk",
|
||||
"ikdamsabahpostasi", "istanbul", "izmirpostasi", "jamanak", "kurun",
|
||||
"leechodebelgrade", "metapolitefsis", "milliyet", "munakasa",
|
||||
"piyasacetveli", "savas", "sondakika", "sonposta", "sonsaat",
|
||||
"sontelgraf", "tan", "tasviriefkar", "turkdili", "turkischepost",
|
||||
"turksozu", "ulus", "ulusalbirlik", "ulussesi", "vakit", "vatan",
|
||||
"yarin", "yeniasir", "yenigun", "yenimersin", "yenisabah", "yeniyol",
|
||||
]
|
||||
|
||||
# /<slug>/<slug>_<year>/<slug>_<year>_<month>_/<slug>_<year>_<month>_<day>_.pdf
|
||||
PDF_HREF_RE = re.compile(r'href="([^"]+\.pdf)"', re.IGNORECASE)
|
||||
DATE_RE = re.compile(
|
||||
r'/([a-z][a-z0-9]+)_(\d{4})_([a-z]+?)_(\d+)_?\.pdf',
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
UA = {"User-Agent": "Mozilla/5.0 (research; ekos-gazete-search; "
|
||||
"contact: kutuphane@istanbul.edu.tr)"}
|
||||
|
||||
|
||||
def normalize_url(href: str) -> str:
    """Turn an href scraped from a gazette listing page into an absolute URL.

    Args:
        href: Raw value of an ``href`` attribute.

    Returns:
        ``href`` unchanged when it already starts with ``http``; otherwise an
        ``https`` URL on the archive host. Relative hrefs are resolved against
        ``BASE`` after their leading ``./`` / ``../`` segments are dropped.
    """
    if href.startswith("http"):
        return href
    if href.startswith("//"):
        # Protocol-relative link: the host is already present, only the
        # scheme is missing. Prefixing the site origin would corrupt it.
        return "https:" + href
    if href.startswith("/"):
        return "https://nek.istanbul.edu.tr" + href
    # Drop *every* leading "./" or "../" segment; stripping just the first
    # one would leave "../" inside the final URL for hrefs like "../../x.pdf".
    href = re.sub(r'^(?:\.+/)+', '', href)
    return BASE + href
|
||||
|
||||
|
||||
def fetch_slug(slug: str, throttle: float = 1.0):
    """Fetch one gazette's listing page and build a manifest row per PDF link.

    Args:
        slug: Gazette identifier placed in the ``gazete.php?gazete=<slug>`` query.
        throttle: Seconds to sleep after the request, whether it succeeded or not.

    Returns:
        A list of dicts with the keys ``slug``, ``year``, ``month``, ``day``
        and ``url``. The list is empty when the request fails or when no href
        on the page matches ``DATE_RE``.
    """
    url = f"{BASE}gazete.php?gazete={slug}"
    print(f" → {slug}", end=" ", flush=True)

    rows = []
    try:
        r = requests.get(url, headers=UA, timeout=30)
        r.raise_for_status()
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next slug.
        print(f"FAIL: {e}")
    else:
        for href in PDF_HREF_RE.findall(r.text):
            m = DATE_RE.search(href)
            if not m:
                continue
            # The row's slug comes from the PDF file name, not from the
            # ``slug`` argument.
            s, year, month, day = m.groups()
            rows.append({
                "slug": s.lower(),
                "year": year,
                "month": month.lower(),
                "day": day,
                "url": normalize_url(href),
            })
        print(f"{len(rows)} PDFs")

    # Throttle after failures too: a server that is already erroring should
    # not be hit again immediately by the next slug's request.
    time.sleep(throttle)
    return rows
|
||||
|
||||
|
||||
def main():
    """Crawl every gazette listing page and write the master PDF manifest.

    Rows from all slugs are collected, de-duplicated by URL (first occurrence
    wins) and written to ``manifests/ekos_master.csv`` under ``HERE``. A short
    per-gazette issue count is printed afterwards.

    Exits with a non-zero status, without touching an existing manifest, when
    the crawl yields no PDF at all.
    """
    out_path = HERE / "manifests" / "ekos_master.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Fetching {len(SLUGS)} gazette pages → {out_path}")
    print(f"Throttle: 1s/req, expected runtime ~1 minute")
    print()

    all_rows = []
    for slug in SLUGS:
        all_rows.extend(fetch_slug(slug))

    # De-dup (the catalogs occasionally repeat hrefs); keep the first row seen
    # for each URL and preserve crawl order.
    unique = {}
    for r in all_rows:
        unique.setdefault(r["url"], r)
    deduped = list(unique.values())

    if not deduped:
        # Every fetch failed or matched nothing (e.g. network outage). Do not
        # replace a previously good manifest with a header-only file.
        sys.exit(f"✗ No PDFs found; leaving {out_path} untouched")

    with out_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["slug", "year", "month", "day", "url"])
        w.writeheader()
        w.writerows(deduped)

    print(f"\n✓ Manifest: {len(deduped)} unique PDFs → {out_path}")
    # Quick stats
    by_slug = {}
    for r in deduped:
        by_slug[r["slug"]] = by_slug.get(r["slug"], 0) + 1
    print(f"\nTop 10 by issue count:")
    for s, c in sorted(by_slug.items(), key=lambda x: -x[1])[:10]:
        print(f" {s:>25} {c:>5}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Iterate the EKOS manifest, download each PDF, extract its text-layer,
|
||||
fuzzy-search against a YAML keyword set, and append hits to JSONL.
|
||||
|
||||
Storage policy:
|
||||
- Default: PDFs go to /tmp/ekos-cache/, processed, DELETED.
|
||||
- With --keep-pdfs DIR: PDFs that produce >=1 hit are MOVED to
|
||||
DIR/<slug>/<year>/<slug>_<year>_<month>_<day>.pdf for re-use.
|
||||
PDFs with zero hits are still deleted (content-driven curation).
|
||||
"""
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
HERE = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(HERE / "scripts"))
|
||||
from lib.fuzzy import (
|
||||
compile_keyword_set, compile_disambiguators,
|
||||
is_false_positive, extract_snippet
|
||||
)
|
||||
|
||||
UA = {"User-Agent": "Mozilla/5.0 (research; ekos-gazete-search)"}
|
||||
DEFAULT_CACHE = Path(os.environ.get("EKOS_CACHE", "/tmp/ekos-cache"))
|
||||
|
||||
|
||||
def in_priority_window(year: int, month: str, day: str, windows: list):
    """Return (in_window: bool, weight: int, reason: str | None) for an issue date.

    Args:
        year: Issue year.
        month: Turkish month slug (ASCII or accented spelling). An unknown
            slug falls back to January.
        day: Day of month, as a digit string or int.
        windows: Entries of the keyword file's ``priority_windows`` list; each
            is a mapping with ``start`` and ``end`` (``YYYY-MM-DD`` strings or
            date objects) and optional ``weight`` / ``reason``. ``None`` is
            treated as an empty list.

    Returns:
        ``(True, weight, reason)`` for the first window that contains the
        date, with ``weight`` defaulting to 3 and ``reason`` to ``""``.
        ``(False, 0, None)`` when no window matches or the date is invalid.
    """
    # Function-scope import: the module does not import datetime at the top.
    from datetime import date, datetime

    # Map Turkish month slugs to numbers for comparison
    month_map = {
        "ocak": 1, "subat": 2, "şubat": 2, "mart": 3, "nisan": 4, "mayis": 5,
        "mayıs": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "ağustos": 8,
        "eylul": 9, "eylül": 9, "ekim": 10, "kasim": 11, "kasım": 11,
        "aralik": 12, "aralık": 12,
    }

    def as_date(value):
        # YAML turns an unquoted 1928-01-01 into a date object, while a quoted
        # one stays a string. Accept both so a window is never dropped just
        # because of quoting.
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, date):
            return value
        return datetime.strptime(value, "%Y-%m-%d").date()

    try:
        cur = date(int(year), month_map.get(month.lower(), 1), int(day))
    except (AttributeError, TypeError, ValueError, OverflowError):
        return False, 0, None

    for w in windows or ():
        try:
            s = as_date(w["start"])
            e = as_date(w["end"])
        except (KeyError, TypeError, ValueError):
            # Malformed window entry: skip it rather than abort the scan.
            continue
        if s <= cur <= e:
            return True, w.get("weight", 3), w.get("reason", "")
    return False, 0, None
|
||||
|
||||
|
||||
def pdftotext_page(pdf_path: Path, page: int, timeout: int = 30) -> str:
    """Extract text from a single page using poppler-utils.

    Args:
        pdf_path: PDF file to read.
        page: 1-based page number, used as both first and last page.
        timeout: Seconds to wait for ``pdftotext`` before giving up.

    Returns:
        The page text as written to stdout by ``pdftotext -layout``, or an
        empty string when the command times out.
    """
    try:
        r = subprocess.run(
            ["pdftotext", "-layout", "-f", str(page), "-l", str(page),
             str(pdf_path), "-"],
            # Decode explicitly as UTF-8 instead of the locale encoding, so
            # Turkish characters survive when running under a C/POSIX locale.
            capture_output=True, text=True, timeout=timeout,
            encoding="utf-8", errors="replace"
        )
        return r.stdout
    except subprocess.TimeoutExpired:
        return ""
|
||||
|
||||
|
||||
def get_page_count(pdf_path: Path) -> int:
    """Return the number of pages reported by ``pdfinfo``.

    Args:
        pdf_path: PDF file to inspect.

    Returns:
        The integer from the ``Pages:`` line of ``pdfinfo``'s output, or 1
        when the command fails, times out or prints no such line.
    """
    try:
        r = subprocess.run(["pdfinfo", str(pdf_path)],
                           # Tolerant UTF-8 decoding: odd metadata bytes must
                           # not raise here, because the fallback below would
                           # then report one page for a multi-page issue.
                           capture_output=True, text=True, timeout=15,
                           encoding="utf-8", errors="replace")
        m = re.search(r"Pages:\s+(\d+)", r.stdout)
        return int(m.group(1)) if m else 1
    except Exception:
        return 1
|
||||
|
||||
|
||||
def process_pdf(row: dict, patterns: list, disambiguators: list,
                cache_dir: Path, priority_info: tuple,
                keep_dir: Path | None = None) -> list:
    """Download one issue, search every page, and return a list of hit dicts.

    If keep_dir is set and the PDF produces >=1 hit, the PDF is moved to
    keep_dir/<slug>/<year>/<basename>.pdf. Zero-hit PDFs are always deleted.

    Args:
        row: Manifest row with ``slug``, ``year``, ``month``, ``day``, ``url``.
        patterns: Iterable of ``(label, weight, compiled_pattern)`` triples.
        disambiguators: Passed through to ``is_false_positive``.
        cache_dir: Directory the PDF is downloaded into.
        priority_info: ``(in_window, weight, reason)`` for this issue's date.
        keep_dir: Optional retention root for hit-producing PDFs.

    Returns:
        Hit dicts (possibly empty). A failed download yields an empty list
        and is reported on stderr.
    """
    slug, year, month, day, url = (row["slug"], row["year"], row["month"],
                                   row["day"], row["url"])
    pdf_path = cache_dir / f"{slug}_{year}_{month}_{day}.pdf"
    in_window, win_weight, win_reason = priority_info

    # Download. The response is used as a context manager so the streamed
    # connection is released on every exit path.
    try:
        with requests.get(url, headers=UA, timeout=120, stream=True) as r:
            if r.status_code != 200:
                print(f" [download error] {slug} {year}/{month}/{day}: "
                      f"HTTP {r.status_code}", file=sys.stderr)
                return []
            with pdf_path.open("wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
    except Exception as e:
        print(f" [download error] {slug} {year}/{month}/{day}: {e}",
              file=sys.stderr)
        # Do not leave a truncated PDF behind in the cache.
        try:
            pdf_path.unlink(missing_ok=True)
        except OSError:
            pass
        return []

    # The window bonus is the same for every hit of this issue.
    bonus = win_weight if in_window else 0

    hits = []
    try:
        n_pages = get_page_count(pdf_path)
        for page in range(1, n_pages + 1):
            text = pdftotext_page(pdf_path, page)
            if len(text) < 50:
                # Skip pages with fewer than 50 characters of extracted text.
                continue
            # Search every compiled pattern against this page
            for label, weight, pat in patterns:
                for m in pat.finditer(text):
                    if is_false_positive(text, m.start(), m.end(),
                                         disambiguators, window=200):
                        continue
                    snippet = extract_snippet(text, m.start(), m.end(), 200)
                    hits.append({
                        "slug": slug, "year": year, "month": month,
                        "day": day, "page": page,
                        "keyword": label, "match": m.group(0),
                        "snippet": snippet, "url": url,
                        "weight": weight + bonus,
                        "priority_window": in_window,
                        "window_reason": win_reason if in_window else None,
                    })
    except Exception as e:
        print(f" [error] {slug} {year}/{month}/{day}: {e}", file=sys.stderr)
    finally:
        try:
            if keep_dir is not None and hits:
                target_dir = keep_dir / slug / str(year)
                target_dir.mkdir(parents=True, exist_ok=True)
                target_path = target_dir / pdf_path.name
                shutil.move(str(pdf_path), str(target_path))
                for h in hits:
                    h["local_pdf"] = str(target_path)
            else:
                pdf_path.unlink()
        except Exception as e:
            print(f" [retain error] {pdf_path}: {e}", file=sys.stderr)
    return hits
|
||||
|
||||
|
||||
def main():
    """Parse CLI options, filter the manifest, and run the search worker pool.

    The keyword YAML is compiled into patterns and disambiguators, the
    manifest is filtered by slug / year range / priority window, and the
    remaining issues are dispatched to a thread pool. Hits are appended to
    the output JSONL file as each job completes.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--manifest", default=str(HERE / "manifests/ekos_master.csv"))
    ap.add_argument("--keywords", default=str(HERE / "keywords/kirim.yaml"))
    ap.add_argument("--out", default=str(HERE / "hits/kirim.jsonl"))
    ap.add_argument("--priority-only", action="store_true",
                    help="Only process issues inside priority_windows")
    ap.add_argument("--year-from", type=int)
    ap.add_argument("--year-to", type=int)
    ap.add_argument("--slug", help="Restrict to gazette slug(s); comma-separated for multiple")
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--limit", type=int, help="Process at most N issues")
    ap.add_argument("--throttle", type=float, default=0.25,
                    help="Seconds to sleep between job dispatches")
    ap.add_argument("--cache", default=str(DEFAULT_CACHE))
    ap.add_argument("--keep-pdfs", default=None,
                    help="Move hit-producing PDFs into DIR/<slug>/<year>/ "
                         "instead of deleting them. Zero-hit PDFs are still "
                         "deleted (content-driven curation).")
    args = ap.parse_args()
    keep_dir = Path(args.keep_pdfs) if args.keep_pdfs else None
    if keep_dir:
        keep_dir.mkdir(parents=True, exist_ok=True)
        print(f"PDF retention: hit-only → {keep_dir}/<slug>/<year>/")

    # Load keywords. The file contains Turkish and Cyrillic text, so read it
    # as UTF-8 regardless of the process locale.
    with open(args.keywords, encoding="utf-8") as f:
        keyword_data = yaml.safe_load(f)
    patterns = compile_keyword_set(keyword_data)
    disambiguators = compile_disambiguators(keyword_data)
    # "or []" also covers a priority_windows key that is present but null.
    windows = keyword_data.get("priority_windows") or []

    print(f"Compiled {len(patterns)} patterns, "
          f"{len(disambiguators)} disambiguators, "
          f"{len(windows)} priority windows")

    cache_dir = Path(args.cache)
    cache_dir.mkdir(parents=True, exist_ok=True)
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Load + filter manifest
    slug_filter = set(s.strip() for s in args.slug.split(",")) if args.slug else None
    rows = []
    with open(args.manifest, newline="", encoding="utf-8") as f:
        for r in csv.DictReader(f):
            if slug_filter and r["slug"] not in slug_filter:
                continue
            try:
                y = int(r["year"])
            except ValueError:
                continue
            if args.year_from and y < args.year_from:
                continue
            if args.year_to and y > args.year_to:
                continue
            in_w, _, _ = in_priority_window(y, r["month"], r["day"], windows)
            r["_in_window"] = in_w
            if args.priority_only and not in_w:
                continue
            rows.append(r)

    # Sort priority-window first.
    # NOTE: year, month and day are compared as strings here, so the order
    # inside each group is lexicographic rather than chronological.
    rows.sort(key=lambda r: (0 if r["_in_window"] else 1,
                             r["year"], r["month"], r["day"]))
    if args.limit:
        rows = rows[:args.limit]

    print(f"Processing {len(rows)} issues "
          f"({sum(1 for r in rows if r['_in_window'])} in priority windows)")
    print(f"Workers: {args.workers}, throttle: {args.throttle}s")
    print(f"Output: {out_path}")
    print()

    start = time.time()
    n_hits = 0
    n_done = 0

    def submit_job(executor, row):
        prio = in_priority_window(int(row["year"]), row["month"],
                                  row["day"], windows)
        return executor.submit(process_pdf, row, patterns, disambiguators,
                               cache_dir, prio, keep_dir)

    with out_path.open("a", encoding="utf-8") as out_f, \
            ThreadPoolExecutor(max_workers=args.workers) as ex:
        # Interleaved submit+collect: keep ~workers*2 jobs in flight,
        # flush hits & log progress as each future completes (crash-safe).
        row_iter = iter(rows)

        def submit_next():
            try:
                r = next(row_iter)
            except StopIteration:
                return None
            if args.throttle > 0:
                time.sleep(args.throttle)
            return submit_job(ex, r)

        in_flight = set()
        for _ in range(args.workers * 2):
            f = submit_next()
            if f is None:
                break
            in_flight.add(f)

        while in_flight:
            done, in_flight = wait(in_flight, return_when=FIRST_COMPLETED)
            for fut in done:
                try:
                    hits = fut.result()
                except Exception as e:
                    print(f" [worker error] {e}", file=sys.stderr)
                    hits = []
                for h in hits:
                    out_f.write(json.dumps(h, ensure_ascii=False) + "\n")
                if hits:
                    out_f.flush()
                n_hits += len(hits)
                n_done += 1
                if n_done % 25 == 0:
                    # Guard against a zero elapsed time on coarse clocks.
                    rate = n_done / max(time.time() - start, 1e-6)
                    eta = (len(rows) - n_done) / max(rate, 0.01)
                    print(f" [{n_done}/{len(rows)}] hits={n_hits} "
                          f"rate={rate:.1f}/s eta={eta/60:.1f}min",
                          flush=True)
                # Refill: one new job for every job that finished.
                f = submit_next()
                if f is not None:
                    in_flight.add(f)

    print(f"\n✓ Done in {(time.time()-start)/60:.1f}min: "
          f"{n_done} issues processed, {n_hits} hits → {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Render hits/<topic>.jsonl into Obsidian-friendly Markdown reports
|
||||
under 6-Geopolitics/Russia/03. HISTORICAL CONTEXT/ .
|
||||
|
||||
Output:
|
||||
EKOS-<Topic>-Bulgular.md — master, cross-year overview
|
||||
EKOS-<Topic>-<YYYY>.md — per-year detailed list
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
VAULT_BASE = Path("/home/salva/Obsidian/6-Geopolitics/Russia/03. HISTORICAL CONTEXT")
|
||||
HERE = Path(__file__).resolve().parent.parent
|
||||
|
||||
MONTH_TR = {
|
||||
"ocak": "Ocak", "subat": "Şubat", "mart": "Mart", "nisan": "Nisan",
|
||||
"mayis": "Mayıs", "haziran": "Haziran", "temmuz": "Temmuz",
|
||||
"agustos": "Ağustos", "eylul": "Eylül", "ekim": "Ekim",
|
||||
"kasim": "Kasım", "aralik": "Aralık",
|
||||
}
|
||||
|
||||
|
||||
def fmt_date(year: str, month: str, day: str) -> str:
    """Return ``<year>-<month>-<day>`` with the day left-padded to two digits.

    ``month`` is inserted as given (a month slug such as ``mart``), so the
    result is a label rather than an ISO date. ``day`` may be a string or an
    int; it is converted with ``str()`` before padding.
    """
    return f"{year}-{month}-{str(day).rjust(2, '0')}"
|
||||
|
||||
|
||||
def load_hits(path: Path) -> list:
    """Read a JSONL hits file and return the decoded records in file order.

    Blank lines are ignored. A line that is not valid JSON (for example a
    record cut short when the writer was interrupted) is skipped and reported
    on stderr, so one bad line does not abort the whole report.

    Args:
        path: JSONL file, one hit object per line, UTF-8 encoded.

    Returns:
        List of decoded objects.
    """
    # Function-scope import: this module does not import sys at the top.
    import sys

    hits = []
    with path.open(encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                hits.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f" [skip] {path}:{lineno}: invalid JSON ({e})",
                      file=sys.stderr)
    return hits
|
||||
|
||||
|
||||
def write_master(path: Path, hits: list, topic: str):
    """Write the cross-year master report for ``topic`` to ``path``.

    The file holds a metadata header, a per-year table linking to the yearly
    notes, the 25 most frequent keywords, the 20 gazettes with the most hits,
    and the 30 highest-weighted hits with their snippets.

    Args:
        path: Markdown file to create or overwrite.
        hits: Hit dicts as produced by the search step.
        topic: Topic name used in headings and in the yearly note links.
    """
    hits_by_year = defaultdict(list)
    for hit in hits:
        hits_by_year[hit["year"]].append(hit)

    keyword_counts = Counter(hit["keyword"] for hit in hits)
    gazette_counts = Counter(hit["slug"] for hit in hits)

    priority_hits = [hit for hit in hits if hit.get("priority_window")]

    with path.open("w", encoding="utf-8") as out:
        out.write(f"""---
up:: [[Russia - PDF Library Index]]
tag:: [[6.1-Geopolitical Analysis]]
created:: {datetime.now().strftime('%Y-%m-%d')}
topic:: {topic}
total_hits:: {len(hits)}
priority_hits:: {len(priority_hits)}
source:: EKOS - İstanbul Üniversitesi NEK
---

# EKOS — {topic} Bulguları (Master)

> **İstanbul Üniversitesi Nadir Eserler Kütüphanesi gazete arşivi (1928-1942)**
> Toplam **{len(hits)} hit** — bunların **{len(priority_hits)}** tanesi öncelikli zaman pencerelerinde.
> Tarama tarihi: {datetime.now().strftime('%Y-%m-%d')}

## Yıllara Göre Dağılım

| Yıl | Toplam Hit | Öncelik Hit | Yıllık Rapor |
|---|---:|---:|---|
""")
        # One table row per year, linking to that year's detailed note.
        for year in sorted(hits_by_year):
            in_year = hits_by_year[year]
            in_year_priority = len([hit for hit in in_year
                                    if hit.get("priority_window")])
            note_name = f"EKOS-{topic}-{year}"
            out.write(f"| {year} | {len(in_year)} | {in_year_priority} | [[{note_name}]] |\n")

        out.write("\n## En Sık Geçen Anahtar Terimler\n\n")
        for keyword, count in keyword_counts.most_common(25):
            out.write(f"- **{keyword}** — {count}\n")

        out.write("\n## En Verimli Gazeteler\n\n")
        for gazette, count in gazette_counts.most_common(20):
            out.write(f"- `{gazette}` — {count}\n")

        # Highest-weighted hits first; ties keep their original order.
        out.write("\n## En Yüksek Skorlu 30 Hit (öncelikli inceleme)\n\n")
        ranked = sorted(hits, key=lambda hit: hit.get("weight", 0), reverse=True)
        for hit in ranked[:30]:
            label = fmt_date(hit["year"], hit["month"], hit["day"])
            out.write(f"### {hit['slug']} — {label} — sayfa {hit['page']}\n\n")
            out.write(f"- **Kelime:** {hit['keyword']} (match: `{hit['match']}`)\n")
            out.write(f"- **Skor:** {hit.get('weight', 0)}")
            if hit.get("priority_window"):
                out.write(f" _(öncelikli pencere: {hit.get('window_reason', '')})_")
            out.write(f"\n- **Kaynak:** [PDF]({hit['url']})\n")
            out.write(f"- **Bağlam:**\n > {hit['snippet']}\n\n")

        out.write(f"\n---\n_Otomatik üretildi: ekos-gazete-search skill, {datetime.now().strftime('%Y-%m-%d %H:%M')}_\n")
|
||||
|
||||
|
||||
def write_yearly(path: Path, hits: list, year: str, topic: str, master_stem: str):
    """Render the per-year Markdown note for one topic.

    The note starts with a ``key:: value`` metadata header that links back to
    the master note, followed by a hit summary, the priority-window reasons
    seen in this year, the ten most frequent keywords, and finally every hit
    grouped under its formatted date.

    Args:
        path: Output file; it is overwritten if it already exists.
        hits: Hit dicts belonging to a single year. Each one must provide
            ``year``, ``month``, ``day``, ``slug``, ``page``, ``keyword``,
            ``match``, ``snippet`` and ``url``; ``weight``,
            ``priority_window`` and ``window_reason`` are optional.
        year: Year label used in the header and the title.
        topic: Topic label used in the header and the title.
        master_stem: File stem of the master note, used for the ``up::`` link.
    """
    # Group by the formatted date string. NOTE(review): fmt_date is defined
    # outside this block; the section order below is the lexical order of
    # whatever it returns - confirm that this is chronological.
    by_date = defaultdict(list)
    for h in hits:
        by_date[fmt_date(h["year"], h["month"], h["day"])].append(h)

    kw_counter = Counter(h["keyword"] for h in hits)
    priority_hits = [h for h in hits if h.get("priority_window")]

    # Sorted so that re-rendering the same hits produces an identical file;
    # iterating the raw set would emit the bullets in arbitrary order.
    window_reasons = sorted(
        {h.get("window_reason") for h in priority_hits} - {None},
        key=str,
    )

    with path.open("w", encoding="utf-8") as f:
        f.write(f"""---
up:: [[{master_stem}]]
tag:: [[6.1-Geopolitical Analysis]]
year:: {year}
topic:: {topic}
hit_count:: {len(hits)}
priority_hits:: {len(priority_hits)}
---

# EKOS — {topic} {year}

**Toplam hit:** {len(hits)}{f' — bunların {len(priority_hits)} tanesi öncelikli pencerede' if priority_hits else ''}.

""")
        if window_reasons:
            f.write("**Öncelikli pencereler bu yılda:**\n")
            for r in window_reasons:
                f.write(f"- {r}\n")
            f.write("\n")

        f.write("**Kelime dağılımı:** ")
        f.write(", ".join(f"{k} ({v})" for k, v in kw_counter.most_common(10)))
        f.write("\n\n---\n\n")

        for date_str in sorted(by_date):
            # Within one date, the highest-weighted hits come first.
            date_hits = sorted(by_date[date_str], key=lambda h: -h.get("weight", 0))
            # Fall back to the raw month value when MONTH_TR has no entry.
            month_pretty = MONTH_TR.get(date_hits[0]["month"], date_hits[0]["month"])
            f.write(f"## {date_str} _({month_pretty})_\n\n")
            for h in date_hits:
                f.write(f"### {h['slug']} — sayfa {h['page']} — `{h['keyword']}`\n\n")
                f.write(f"> {h['snippet']}\n\n")
                f.write(f"- Match: `{h['match']}` • Skor: {h.get('weight', 0)}")
                if h.get("priority_window"):
                    f.write(" 🔥")
                f.write(f"\n- [PDF]({h['url']})\n\n")

        f.write(f"\n---\n_ekos-gazete-search, {datetime.now().strftime('%Y-%m-%d %H:%M')}_\n")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: render the master note and one note per year.

    Reads the hits file given by ``--hits``, writes
    ``EKOS-<topic>-Bulgular.md`` plus ``EKOS-<topic>-<year>.md`` for every
    year present into the ``--vault`` directory, and prints a progress line
    per file. Returns early with a message when there is nothing to render.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--hits", default=str(HERE / "hits/kirim.jsonl"))
    parser.add_argument(
        "--topic",
        default="Kirim",
        help="Used in filenames (e.g. Kirim → EKOS-Kirim-1932.md)",
    )
    parser.add_argument(
        "--vault",
        default=str(VAULT_BASE),
        help="Output base dir under vault",
    )
    opts = parser.parse_args()

    source = Path(opts.hits)
    has_content = source.exists() and source.stat().st_size != 0
    if not has_content:
        print(f"[!] No hits at {source} — run 02_search_pdfs.py first")
        return

    hits = load_hits(source)
    if not hits:
        # The file existed but contained no parseable records.
        print(f"[!] hits file empty: {source}")
        return
    print(f"Loaded {len(hits)} hits")

    out_dir = Path(opts.vault)
    out_dir.mkdir(parents=True, exist_ok=True)

    master_note = out_dir / f"EKOS-{opts.topic}-Bulgular.md"
    write_master(master_note, hits, opts.topic)
    print(f" ✓ master → {master_note}")

    # Bucket the hits per year, then emit the yearly notes in ascending order.
    per_year = {}
    for hit in hits:
        per_year.setdefault(hit["year"], []).append(hit)

    for year in sorted(per_year):
        bucket = per_year[year]
        year_note = out_dir / f"EKOS-{opts.topic}-{year}.md"
        write_yearly(year_note, bucket, year, opts.topic, master_note.stem)
        print(f" ✓ {year} ({len(bucket)} hit) → {year_note.name}")

    print(f"\n✓ Reports rendered under {out_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
282
personas/_shared/skills/ekos-gazete-search/scripts/04_export.py
Normal file
282
personas/_shared/skills/ekos-gazete-search/scripts/04_export.py
Normal file
@@ -0,0 +1,282 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Export hits/<topic>.jsonl into:
|
||||
reports/EKOS-<Topic>.csv — flat, all hits, chronological
|
||||
reports/EKOS-<Topic>-Rapor.docx — formatted Word report (TOC, top-30 smoking
|
||||
guns, per-year sections with snippets)
|
||||
|
||||
Examples:
|
||||
python scripts/04_export.py
|
||||
python scripts/04_export.py --topic Kirim --out-dir /home/salva/Documents/EKOS-out
|
||||
"""
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.shared import Pt, RGBColor, Cm
|
||||
|
||||
HERE = Path(__file__).resolve().parent.parent
|
||||
|
||||
MONTH_TR = {
|
||||
"ocak": ("Ocak", 1), "subat": ("Şubat", 2), "şubat": ("Şubat", 2),
|
||||
"mart": ("Mart", 3), "nisan": ("Nisan", 4), "mayis": ("Mayıs", 5),
|
||||
"mayıs": ("Mayıs", 5), "haziran": ("Haziran", 6), "temmuz": ("Temmuz", 7),
|
||||
"agustos": ("Ağustos", 8), "ağustos": ("Ağustos", 8),
|
||||
"eylul": ("Eylül", 9), "eylül": ("Eylül", 9), "ekim": ("Ekim", 10),
|
||||
"kasim": ("Kasım", 11), "kasım": ("Kasım", 11),
|
||||
"aralik": ("Aralık", 12), "aralık": ("Aralık", 12),
|
||||
"kanunusani": ("Ocak", 1), "kanunuevvel": ("Aralık", 12),
|
||||
"tesrinievvel": ("Ekim", 10), "tesrinisani": ("Kasım", 11),
|
||||
}
|
||||
|
||||
|
||||
def date_key(h):
    """Return the chronological sort key ``(year, month_number, day)`` of a hit.

    A month name that is missing from ``MONTH_TR`` and a day that cannot be
    converted to ``int`` both become 99, which orders them after every real
    month or day of the same year.
    """
    month_name = h["month"]
    _, month_num = MONTH_TR.get(month_name.lower(), (month_name, 99))
    try:
        day_num = int(h["day"])
    except Exception:
        day_num = 99
    return (int(h["year"]), month_num, day_num)
|
||||
|
||||
|
||||
def load_hits(path):
    """Read a JSON Lines hits file and return its records as a list.

    Lines that are empty after stripping surrounding whitespace are skipped;
    every other line is decoded with ``json.loads``.
    """
    with path.open(encoding="utf-8") as fh:
        stripped = (raw.strip() for raw in fh)
        return [json.loads(entry) for entry in stripped if entry]
|
||||
|
||||
|
||||
def write_csv(path, hits):
    """Write every hit to *path* as a flat UTF-8 CSV and return the row count.

    Rows are ordered chronologically via ``date_key`` and, within the same
    date, by descending ``weight``. Keys outside the fixed column list are
    ignored, and newlines inside ``snippet`` are replaced by spaces so each
    hit stays on one physical line.
    """
    columns = [
        "year", "month", "day", "slug", "page",
        "keyword", "match", "weight",
        "priority_window", "window_reason", "snippet", "url",
    ]
    ordered = sorted(hits, key=lambda h: (date_key(h), -h.get("weight", 0)))
    with path.open("w", encoding="utf-8", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(
            {**hit, "snippet": (hit.get("snippet") or "").replace("\n", " ").strip()}
            for hit in ordered
        )
    return len(ordered)
|
||||
|
||||
|
||||
def _set_cell_bold(cell, bold=True):
|
||||
for p in cell.paragraphs:
|
||||
for r in p.runs:
|
||||
r.bold = bold
|
||||
|
||||
|
||||
def _add_table(doc, headers, rows):
    """Append a "Light Grid Accent 1" table with a bold header row to *doc*."""
    tbl = doc.add_table(rows=1, cols=len(headers))
    tbl.style = "Light Grid Accent 1"
    for cell, title in zip(tbl.rows[0].cells, headers):
        cell.text = title
        _set_cell_bold(cell, True)
    for values in rows:
        for cell, value in zip(tbl.add_row().cells, values):
            # Cell text must be a string; counts (and possibly years) are not.
            cell.text = str(value)
    return tbl


def _pretty_month(month):
    """Return the display name for *month*, or *month* itself when unknown."""
    return MONTH_TR.get(month.lower(), (month, 0))[0]


def _add_snippet(doc, text, indent_cm, size_pt):
    """Append an indented, italic paragraph holding a hit's context snippet."""
    para = doc.add_paragraph()
    para.paragraph_format.left_indent = Cm(indent_cm)
    run = para.add_run(text)
    run.italic = True
    run.font.size = Pt(size_pt)


def write_docx(path, hits, topic):
    """Render all hits into a formatted Word report saved at *path*.

    Layout: title and subtitle, overview statistics, three summary tables
    (hits per year, top 20 keywords, top 15 newspapers), the 30
    highest-weighted hits with snippet and PDF link, one section per year
    with hits grouped by date, and a generation footer.

    Args:
        path: Destination ``.docx`` file; passed to ``Document.save`` as str.
        hits: Hit dicts. Each must provide ``year``, ``month``, ``day``,
            ``slug``, ``page``, ``keyword``, ``match`` and ``url``;
            ``weight``, ``snippet``, ``priority_window`` and
            ``window_reason`` are optional. An empty list yields a report
            that contains only the headings and empty tables.
        topic: Topic label used in the document title.
    """
    doc = Document()

    # Margins
    for s in doc.sections:
        s.top_margin = Cm(2.0)
        s.bottom_margin = Cm(2.0)
        s.left_margin = Cm(2.0)
        s.right_margin = Cm(2.0)

    # Title
    t = doc.add_heading(f"EKOS — {topic} Bulguları", level=0)
    t.alignment = WD_ALIGN_PARAGRAPH.CENTER

    sub = doc.add_paragraph()
    sub.alignment = WD_ALIGN_PARAGRAPH.CENTER
    r = sub.add_run("İstanbul Üniversitesi NEK Gazete Arşivi (1928-1942)")
    r.italic = True
    r.font.size = Pt(11)

    # Stats overview
    by_year = defaultdict(list)
    for h in hits:
        by_year[h["year"]].append(h)
    kw_counter = Counter(h["keyword"] for h in hits)
    slug_counter = Counter(h["slug"] for h in hits)
    priority_hits = [h for h in hits if h.get("priority_window")]
    yrs = sorted(by_year)

    doc.add_paragraph()
    p = doc.add_paragraph()
    p.add_run("Üretim tarihi: ").bold = True
    p.add_run(datetime.now().strftime("%Y-%m-%d %H:%M"))
    p = doc.add_paragraph()
    p.add_run("Toplam vuruş: ").bold = True
    p.add_run(f"{len(hits)} ")
    p.add_run("Öncelikli pencere içinde: ").bold = True
    p.add_run(f"{len(priority_hits)}")
    p = doc.add_paragraph()
    p.add_run("Yıl aralığı: ").bold = True
    # With no hits there is no first/last year to show; print a dash instead
    # of indexing into an empty list.
    p.add_run(f"{yrs[0]} – {yrs[-1]} " if yrs else "— ")
    p.add_run("Gazete sayısı: ").bold = True
    p.add_run(f"{len(slug_counter)}")

    # Yearly distribution table
    doc.add_heading("Yıllara Göre Dağılım", level=1)
    _add_table(
        doc,
        ["Yıl", "Toplam", "Öncelikli"],
        (
            (y, len(by_year[y]),
             sum(1 for h in by_year[y] if h.get("priority_window")))
            for y in yrs
        ),
    )

    # Keyword distribution
    doc.add_heading("Anahtar Kelime Dağılımı (top 20)", level=1)
    _add_table(doc, ["Anahtar", "Sayı"], kw_counter.most_common(20))

    # Slug productivity
    doc.add_heading("En Verimli Gazeteler", level=1)
    _add_table(doc, ["Gazete", "Vuruş"], slug_counter.most_common(15))

    # Top scored hits
    doc.add_page_break()
    doc.add_heading("En Yüksek Skorlu 30 Vuruş", level=1)
    # Italic is a run-level property: assigning ``.italic`` on the Paragraph
    # returned by add_paragraph(text) has no effect on the rendered text.
    note = doc.add_paragraph()
    note.add_run(
        "Skorlama: temel kelime ağırlığı + öncelikli pencere bonusu. "
        "Yüksek skorlu vuruşlar manuel okumada ilk öncelik."
    ).italic = True

    top = sorted(hits, key=lambda h: -h.get("weight", 0))[:30]
    for i, h in enumerate(top, 1):
        m_pretty = _pretty_month(h["month"])
        head = doc.add_paragraph()
        run = head.add_run(
            f"{i}. {h['slug']} — {h['year']} {m_pretty} {h['day']} — s. {h['page']}"
        )
        run.bold = True
        run.font.size = Pt(11)

        meta = doc.add_paragraph()
        meta.add_run("Anahtar: ").bold = True
        meta.add_run(f"{h['keyword']} ")
        meta.add_run("Eşleşme: ").bold = True
        meta.add_run(f"{h['match']} ")
        meta.add_run("Skor: ").bold = True
        meta.add_run(f"{h.get('weight', 0)}")
        if h.get("priority_window"):
            wr = h.get("window_reason") or ""
            # The reason is truncated to 60 characters to keep the line short.
            run2 = meta.add_run(f" [öncelikli: {wr[:60]}]")
            run2.italic = True
            run2.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)

        _add_snippet(doc, h.get("snippet", ""), 0.6, 10)

        url_p = doc.add_paragraph()
        url_p.paragraph_format.left_indent = Cm(0.6)
        url_run = url_p.add_run(f"PDF: {h['url']}")
        url_run.font.size = Pt(8)
        url_run.font.color.rgb = RGBColor(0x55, 0x55, 0x55)

    # Per-year sections
    for year in yrs:
        doc.add_page_break()
        year_hits = sorted(by_year[year],
                           key=lambda h: (date_key(h), -h.get("weight", 0)))
        prio = sum(1 for h in year_hits if h.get("priority_window"))
        doc.add_heading(f"{year} ({len(year_hits)} vuruş, {prio} öncelikli)",
                        level=1)

        # Quick keyword summary for the year
        yk = Counter(h["keyword"] for h in year_hits)
        s = doc.add_paragraph()
        s.add_run("Anahtar dağılımı: ").bold = True
        s.add_run(", ".join(f"{k}({v})" for k, v in yk.most_common(8)))

        # Group by date
        by_date = defaultdict(list)
        for h in year_hits:
            by_date[(h["year"], h["month"], h["day"])].append(h)

        # Order the date groups with the same key that ordered the hits, so
        # unknown months and unparsable days sort last in both places.
        for dk in sorted(by_date, key=lambda k: date_key(by_date[k][0])):
            y, m, d = dk
            doc.add_heading(f"{y} {_pretty_month(m)} {d}", level=3)
            for h in by_date[dk]:
                p = doc.add_paragraph()
                p.add_run(f"{h['slug']} ").bold = True
                p.add_run(f"s.{h['page']} — ")
                kr = p.add_run(h["keyword"])
                kr.bold = True
                if h.get("priority_window"):
                    kr.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
                p.add_run(f" (skor {h.get('weight', 0)})")

                _add_snippet(doc, h.get("snippet", ""), 0.5, 9)

    # Footer
    doc.add_page_break()
    f = doc.add_paragraph()
    f.alignment = WD_ALIGN_PARAGRAPH.CENTER
    fr = f.add_run(f"Otomatik üretildi: ekos-gazete-search skill, "
                   f"{datetime.now().strftime('%Y-%m-%d %H:%M')}")
    fr.italic = True
    fr.font.size = Pt(9)

    doc.save(str(path))
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: export the hits file to CSV and DOCX.

    Reads ``--hits`` and writes ``EKOS-<topic>.csv`` and
    ``EKOS-<topic>-Rapor.docx`` into ``--out-dir`` (created if missing).
    Returns early with a message when the hits file is missing, zero-sized,
    or contains no records.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--hits", default=str(HERE / "hits/kirim.jsonl"))
    ap.add_argument("--topic", default="Kirim")
    ap.add_argument("--out-dir", default=str(HERE / "reports"))
    args = ap.parse_args()

    hits_path = Path(args.hits)
    if not hits_path.exists() or hits_path.stat().st_size == 0:
        print(f"[!] No hits at {hits_path}")
        return

    hits = load_hits(hits_path)
    if not hits:
        # A file holding only blank lines passes the size check above but
        # yields no records; stop here instead of exporting an empty report.
        print(f"[!] hits file empty: {hits_path}")
        return
    print(f"Loaded {len(hits)} hits from {hits_path}")

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    csv_path = out_dir / f"EKOS-{args.topic}.csv"
    docx_path = out_dir / f"EKOS-{args.topic}-Rapor.docx"

    n = write_csv(csv_path, hits)
    print(f" ✓ CSV ({n} rows) → {csv_path}")

    write_docx(docx_path, hits, args.topic)
    print(f" ✓ DOCX → {docx_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
173
personas/_shared/skills/ekos-gazete-search/scripts/lib/fuzzy.py
Normal file
173
personas/_shared/skills/ekos-gazete-search/scripts/lib/fuzzy.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
OCR-tolerant fuzzy regex builder for Turkish text.
|
||||
|
||||
Strategy: 2014-vintage Turkish OCR systematically destroys diacritics.
|
||||
Each character is replaced with a character class that covers all
|
||||
plausible OCR misreadings. See: keywords/kirim.yaml notes column.
|
||||
"""
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
# Character → tolerant character class mapping.
# Keys are Turkish-lowercased characters: the pattern builder lowercases each
# input character with turkish_lower() before looking it up here.
DIACRITIC_CLASSES = {
    # The ı/i/I/İ family — the most damaged
    'i': r'[1iIıİlj|!]',
    'ı': r'[1iIıİlj|!]',
    # Sibilants
    's': r'[s$ş]',
    'ş': r'[s$ş~]',
    # Plosives
    'c': r'[cç]',
    'ç': r'[cç]',
    'g': r'[gğ]',
    'ğ': r'[gğq]',
    # Vowels
    'u': r'(?:[uü]|ii)',
    'ü': r'(?:[uü]|ii)',
    'o': r'[oö0]',
    'ö': r'[oö0]',
    'a': r'[aâå]',
    'â': r'[aâå]',
    'e': r'[eé]',
}

# Non-letter separators in OCR can be whitespace, dash, underscore, tilde,
# dot or comma.
WORD_SEP = r'[\s\-_~.,]+'

# Word boundaries: \b doesn't work well with character classes, so the
# patterns are wrapped in lookarounds that reject an adjacent word character.
_LEFT_BOUNDARY = r'(?<![\wıİşŞçÇğĞüÜöÖâÂ])'
_RIGHT_BOUNDARY = r'(?![\wıİşŞçÇğĞüÜöÖâÂ])'


def turkish_lower(s: str) -> str:
    """Turkish-aware lowercase: İ→i, I→ı."""
    return s.replace('İ', 'i').replace('I', 'ı').lower()


def _tolerant_body(text: str) -> str:
    """Translate *text* character by character into an OCR-tolerant regex.

    The result carries no boundary lookarounds, so it can be used both for a
    whole word and for a suffix appended to one.
    """
    parts = []
    for ch in text:
        lower = turkish_lower(ch)
        if lower in DIACRITIC_CLASSES:
            parts.append(DIACRITIC_CLASSES[lower])
        elif ch == ' ':
            parts.append(WORD_SEP)
        elif ch.isalpha():
            # Letter without a damage class — match either case
            parts.append(f'[{ch.lower()}{ch.upper()}]')
        else:
            parts.append(re.escape(ch))
    return ''.join(parts)


def build_pattern(word: str) -> str:
    """Build OCR-tolerant regex for a single word or phrase.

    Returns the pattern source (not a compiled regex), wrapped in lookarounds
    so that it only matches when not embedded in a longer word.
    """
    return _LEFT_BOUNDARY + _tolerant_body(word) + _RIGHT_BOUNDARY


def build_pattern_with_suffixes(word: str, suffixes: list = None) -> str:
    """Build pattern allowing optional Turkish suffixes.

    Each suffix is made OCR-tolerant in the same way as *word*. A literal
    suffix would not match its damaged form (e.g. ``1n`` for ``ın``), and
    the trailing boundary would then reject the whole word.

    Args:
        word: Word or phrase to match.
        suffixes: Suffix strings of which at most one may follow *word*.
            ``None`` or an empty list gives the same result as
            ``build_pattern(word)``.

    Returns:
        The regex pattern source as a string.
    """
    if not suffixes:
        return build_pattern(word)
    suffix_alts = '|'.join(_tolerant_body(s) for s in suffixes)
    suffix_group = rf'(?:{suffix_alts})?'
    return _LEFT_BOUNDARY + _tolerant_body(word) + suffix_group + _RIGHT_BOUNDARY
|
||||
|
||||
|
||||
def _compile_terms(entry: dict, default_weight: int, pattern_for) -> list:
    """Compile the canonical term and aliases of one keyword entry.

    Terms whose pattern fails to compile are reported on stdout and skipped.
    """
    canonical = entry['canonical']
    weight = entry.get('weight', default_weight)
    compiled = []
    # ``or []``: a YAML key that is present but left empty loads as None.
    for term in [canonical] + list(entry.get('aliases') or []):
        try:
            regex = re.compile(pattern_for(term), re.IGNORECASE | re.UNICODE)
        except re.error as e:
            print(f" [warn] regex compile failed for {term!r}: {e}")
            continue
        compiled.append((canonical, weight, regex))
    return compiled


def compile_keyword_set(keyword_data: dict) -> list:
    """
    Compile a YAML keyword set into a list of (label, weight, regex) tuples.

    ``keywords`` entries come first, followed by ``proper_nouns`` entries,
    each in the order given; the list is not sorted by weight. The label of
    every tuple is the entry's ``canonical`` term, also for its aliases.

    Keyword entries default to weight 3 and honour their ``suffixes`` list;
    proper nouns default to weight 5 and are matched without suffixes.
    Sections or lists that are missing or empty are treated as empty.
    """
    compiled = []
    # Main keywords
    for kw in keyword_data.get('keywords') or []:
        suffixes = kw.get('suffixes') or []
        compiled.extend(_compile_terms(
            kw, 3,
            lambda term, sfx=suffixes: build_pattern_with_suffixes(term, sfx),
        ))
    # Proper nouns (smoking guns)
    for pn in keyword_data.get('proper_nouns') or []:
        compiled.extend(_compile_terms(pn, 5, build_pattern))
    return compiled
|
||||
|
||||
|
||||
def compile_disambiguators(keyword_data: dict) -> list:
    """Compile false-positive filter patterns.

    Returns one compiled, case-insensitive regex per term listed under
    ``disambiguators``. A missing or empty section yields an empty list.
    """
    # ``or []``: a YAML key that is present but left empty loads as None.
    terms = keyword_data.get('disambiguators') or []
    return [
        re.compile(build_pattern(term), re.IGNORECASE | re.UNICODE)
        for term in terms
    ]
|
||||
|
||||
|
||||
def is_false_positive(text: str, match_start: int, match_end: int,
                      disambiguators: list, window: int = 50) -> bool:
    """Report whether any disambiguator occurs close to a match.

    The slice of *text* from ``window`` characters before the match to
    ``window`` characters after it (clamped to the text) is searched with
    every compiled pattern in *disambiguators*, e.g. to catch 'Kerim Bey'
    next to a 'Kırım' hit. Returns True on the first pattern that matches.
    """
    lo = match_start - window
    if lo < 0:
        lo = 0
    hi = min(match_end + window, len(text))
    context = text[lo:hi]
    return any(rx.search(context) for rx in disambiguators)
|
||||
|
||||
|
||||
def extract_snippet(text: str, match_start: int, match_end: int,
                    radius: int = 200) -> str:
    """Return the match with up to *radius* characters of context per side.

    Whitespace runs are collapsed to single spaces and the result is
    stripped; afterwards every character that is neither printable nor a
    space/newline is removed.
    """
    window = text[max(match_start - radius, 0):min(match_end + radius, len(text))]
    flattened = re.sub(r'\s+', ' ', window).strip()
    kept = [ch for ch in flattened if ch in ' \n' or ch.isprintable()]
    return ''.join(kept)
|
||||
|
||||
|
||||
def co_occurrence_score(text: str, term_a: str, term_b: str,
                        compiled_patterns: dict, window: int = 300) -> int:
    """
    Count the (term_a match, term_b match) pairs that start within `window`
    characters of each other in `text`.

    `compiled_patterns` maps a term to its compiled regex; when either term
    has no entry the score is 0.
    """
    if not (term_a in compiled_patterns and term_b in compiled_patterns):
        return 0
    starts_a = [m.start() for m in compiled_patterns[term_a].finditer(text)]
    starts_b = [m.start() for m in compiled_patterns[term_b].finditer(text)]
    return sum(
        1
        for pos_a in starts_a
        for pos_b in starts_b
        if abs(pos_a - pos_b) <= window
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Smoke test
|
||||
test_words = ['Kırım', 'Bahçesaray', 'Cafer Seydahmet', 'İsmail Gaspıralı']
|
||||
test_text = """
|
||||
OCR çöplüğü:
|
||||
K1r1m Tatarlari hakkinda bir haber.
|
||||
Bahcesaray'da bir hadise.
|
||||
K~r~m Hanl1g1 tarihi.
|
||||
Cafer Seydamet Bey istanbula geldi.
|
||||
Ismail Gaspirali'nin 1934 anma toplantisi.
|
||||
Kerim Bey ile karistirma — bu yanlis pozitif.
|
||||
"""
|
||||
for w in test_words:
|
||||
pat = build_pattern(w)
|
||||
print(f"\n{w!r} → {pat}")
|
||||
for m in re.finditer(pat, test_text, re.IGNORECASE):
|
||||
print(f" hit: {m.group(0)!r} @ {m.start()}")
|
||||
@@ -0,0 +1,3 @@
|
||||
requests>=2.31
|
||||
PyYAML>=6.0
|
||||
python-docx>=1.0
|
||||
50
personas/_shared/skills/ekos-gazete-search/scripts/run_capped.sh
Executable file
50
personas/_shared/skills/ekos-gazete-search/scripts/run_capped.sh
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env bash
# Run the EKOS PDF searcher inside a transient systemd user-unit with
# CPU + memory caps. All extra args are forwarded to 02_search_pdfs.py.
#
# Profile env vars (override before invocation):
#   EKOS_CPU_QUOTA   default 300% (3 cores)
#   EKOS_MEM_MAX     default 3G
#   EKOS_SWAP_MAX    default 1G
#   EKOS_UNIT        default ekos-search-<timestamp>
#
# Examples:
#   bash scripts/run_capped.sh --slug cumhuriyet --priority-only --year-to 1931 --workers 2
#   EKOS_CPU_QUOTA=500% EKOS_MEM_MAX=4G bash scripts/run_capped.sh --priority-only --workers 4
#
# Monitor:
#   systemctl --user status <unit>
#   journalctl --user -u <unit> -f
#   systemctl --user stop <unit>

set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PY="${HERE}/.venv/bin/python"
SCRIPT="${HERE}/scripts/02_search_pdfs.py"

CPU_QUOTA="${EKOS_CPU_QUOTA:-300%}"
MEM_MAX="${EKOS_MEM_MAX:-3G}"
SWAP_MAX="${EKOS_SWAP_MAX:-1G}"
UNIT="${EKOS_UNIT:-ekos-search-$(date +%Y%m%d-%H%M%S)}"

if [[ ! -x "$PY" ]]; then
    echo "venv not found: $PY" >&2
    echo "create with: cd $HERE && python3 -m venv .venv && .venv/bin/pip install -r scripts/requirements.txt" >&2
    exit 1
fi

# Fail here rather than inside the unit, where the error would only show up
# in the journal.
if [[ ! -f "$SCRIPT" ]]; then
    echo "search script not found: $SCRIPT" >&2
    exit 1
fi

echo "Unit: $UNIT"
echo "CPUQuota: $CPU_QUOTA"
echo "MemoryMax: $MEM_MAX"
echo "MemorySwapMax: $SWAP_MAX"
echo "Forward: $*"
echo

exec systemd-run --user \
    --unit="$UNIT" \
    --working-directory="$HERE" \
    -p "CPUQuota=$CPU_QUOTA" \
    -p "MemoryMax=$MEM_MAX" \
    -p "MemorySwapMax=$SWAP_MAX" \
    -p "Nice=10" \
    -p "IOWeight=50" \
    --setenv=PYTHONUNBUFFERED=1 \
    "$PY" "$SCRIPT" "$@"
|
||||
229
personas/_shared/skills/telegram/SKILL.md
Normal file
229
personas/_shared/skills/telegram/SKILL.md
Normal file
@@ -0,0 +1,229 @@
|
||||
---
|
||||
name: telegram
|
||||
description: Use when reading, searching, sending, or managing Telegram messages and folders for the user's personal account. Triggers on "Telegram'a mesaj gönder", "şu kanaldan son mesajları getir", "Telegram'da ara", "okunmamış mesajlar", "Telegram klasörlerini güncelle", "yeni kanalları kategorize et", "fetch telegram dialogs", "telegram inbox", "@username'e şunu yaz", or any direct mention of fetch_all/tg_read/tg_send/tg_search/tg_inbox/apply_folders. Also covers the Telethon-based pipeline at /home/salva/Documents/telegram (auth, session, channels.json, assignments.json).
|
||||
---
|
||||
|
||||
# Telegram Operator (Telethon)
|
||||
|
||||
## Overview
|
||||
|
||||
Read, search, send, manage, and organize the user's Telegram personal account from the command line via Telethon. All scripts share one venv and one `.session` file at `/home/salva/Documents/telegram/`.
|
||||
|
||||
```
|
||||
┌── tg_read.py (fetch from a chat)
|
||||
├── tg_send.py (send text/file, reply, silent)
|
||||
Telethon client ────┼── tg_search.py (global / scoped search)
|
||||
(one .session) ├── tg_inbox.py (unread overview, mark-read)
|
||||
└── folder pipeline:
|
||||
fetch_all.py → build_assignments.py → apply_folders.py
|
||||
```
|
||||
|
||||
## Project location
|
||||
|
||||
`/home/salva/Documents/telegram/`
|
||||
|
||||
| File | Role |
|
||||
|---|---|
|
||||
| `api.txt` | api_id / api_hash from my.telegram.org. **Do not commit.** |
|
||||
| `config.py` | Loads creds → `API_ID`, `API_HASH`, `SESSION_NAME` |
|
||||
| `telegram_session.session` | Telethon SQLite session. **Do not delete unless re-login needed.** |
|
||||
| `venv/` | Project venv, activate with `source venv/bin/activate` |
|
||||
| `requirements.txt` | `telethon>=1.43.1` |
|
||||
| `tg_utils.py` | Shared helpers: `resolve_chat`, `fmt_msg`, `confirm`, `parse_date` |
|
||||
| `tg_read.py` | Read messages from a chat |
|
||||
| `tg_send.py` | Send text and/or file (interactive confirm by default) |
|
||||
| `tg_search.py` | Search messages globally or in one chat |
|
||||
| `tg_inbox.py` | Unread overview + mark-as-read (single or bulk) |
|
||||
| `fetch_all.py` | Snapshot all dialogs + 40 messages each → `data/channels.json` |
|
||||
| `build_assignments.py` | Static id→folder map → `data/assignments.json` |
|
||||
| `apply_folders.py` | Push folder layout to Telegram (interactive y/N) |
|
||||
| `categorize.py` | Library helper used if pipeline grows beyond static dict |
|
||||
| `data/` | All JSON outputs (channels, assignments, compact, names.tsv, …) |
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
cd /home/salva/Documents/telegram
|
||||
source venv/bin/activate # first time: python3 -m venv venv && venv/bin/pip install -r requirements.txt
|
||||
```
|
||||
|
||||
First run on a new machine: any script will prompt for phone number → SMS code → 2FA password (if set), then writes `telegram_session.session`. Subsequent runs are silent.
|
||||
|
||||
If you see `AuthKeyUnregisteredError` or `SessionPasswordNeededError` after a long absence: delete the `.session` file and re-login.
|
||||
|
||||
## Common chat references
|
||||
|
||||
Every script accepts the same `chat` argument forms:
|
||||
|
||||
- `"@username"` — public username (channels, bots, users)
|
||||
- `12345` or `-1001234567890` — numeric id (positive = user, negative = group/channel)
|
||||
- `"some name"` — case-insensitive substring of the dialog name; errors out if 0 or >1 matches, listing the candidates
|
||||
- `"me"` / `"self"` — Saved Messages (your own DM-to-self chat)
|
||||
|
||||
## Reading
|
||||
|
||||
```bash
|
||||
python tg_read.py "@durov" # last 20 messages
|
||||
python tg_read.py "Born2beroot" --limit 100
|
||||
python tg_read.py "@channel" --since 2026-04-01 # since a date
|
||||
python tg_read.py "@x" --search "CVE" # filter inside the chat
|
||||
python tg_read.py "@x" --json # machine-readable output
|
||||
python tg_read.py "@x" --mark-read # also clear the unread badge
|
||||
```
|
||||
|
||||
Output format: `id │ YYYY-MM-DD HH:MM │ sender(20) │ text(200)` — one row per message. `--json` switches the output to a JSON array with one `{id,date,sender_id,text,has_media,reply_to}` object per message.
|
||||
|
||||
## Sending
|
||||
|
||||
```bash
|
||||
python tg_send.py "@user" "Hello" # interactive [y/N]
|
||||
python tg_send.py "me" "note to self" --yes # auto-confirm
|
||||
python tg_send.py "@chan" "Caption" --file report.pdf
|
||||
python tg_send.py "@x" "" --file img.png --caption "ss" # file-only
|
||||
python tg_send.py "@x" "Reply" --reply-to 12345
|
||||
python tg_send.py "@x" "ping" --silent # no notification
|
||||
python tg_send.py "@x" "<b>bold</b>" --parse html
|
||||
```
|
||||
|
||||
**Send safety policy** — by default, `tg_send.py` prints a preview of the destination + payload and asks `Gönder? [y/N]` before transmitting. Pass `--yes` (or `-y`) to skip the prompt for scripted/automated runs. This matches the convention used by `apply_folders.py`.
|
||||
|
||||
Default text parse mode is **markdown**. Use `--parse html` for HTML-style entities (`<b>`, `<i>`, `<a href=…>`), or `--parse none` for plain.
|
||||
|
||||
## Searching
|
||||
|
||||
```bash
|
||||
python tg_search.py "ransomware" # global, last 50 hits
|
||||
python tg_search.py "Putin" --since 2026-04-01 -n 200
|
||||
python tg_search.py "kitap" --chat "E Kitap PDF" # scoped to one chat
|
||||
python tg_search.py "report" --chat me
|
||||
```
|
||||
|
||||
Global search uses Telegram's server-side message index. Each hit is prefixed with the chat's title in `[brackets]`. Scoped search (`--chat`) is faster and avoids the per-chat title resolution lookup.
|
||||
|
||||
## Inbox / unread management
|
||||
|
||||
```bash
|
||||
python tg_inbox.py # ranked by unread count
|
||||
python tg_inbox.py --top 20
|
||||
python tg_inbox.py --include-archived # include archived folder
|
||||
python tg_inbox.py --mark-read "Born2bero" # clear ONE chat
|
||||
python tg_inbox.py --mark-all-read # clear EVERY unread (asks y/N)
|
||||
python tg_inbox.py --mark-all-read --yes # … or skip prompt
|
||||
```
|
||||
|
||||
The bulk `--mark-all-read` is destructive on the unread badge state and irreversible — there is no "mark-as-unread" RPC. The script always confirms unless `--yes`.
|
||||
|
||||
## Folder pipeline (≈600 dialogs → 9 folders)
|
||||
|
||||
3-stage workflow for organizing dialogs into Telegram client-side folders:
|
||||
|
||||
```bash
|
||||
python fetch_all.py # ~1-3 min, refreshes data/channels.json
|
||||
python build_assignments.py # warns about ⚠ unassigned ids
|
||||
# → if warnings: edit build_assignments.py:A, add the new ids, rerun
|
||||
python apply_folders.py # interactive y/N to push to Telegram
|
||||
```
|
||||
|
||||
### Folder schema (current — titles capped at 12 chars by Telegram)
|
||||
|
||||
| Emoji | Title | Scope |
|
||||
|---|---|---|
|
||||
| 🛡 | `Güvenlik` | Cybersec, hacking, intel feeds, OSINT, ham radio |
|
||||
| ☁ | `Logs & Cloud` | Cloud account dumps, ULP/redline logs, cracked services |
|
||||
| ⚔ | `Rus-Ukrayna` | Russia/Ukraine war channels, both sides + Western trackers |
|
||||
| 🕌 | `Ortadoğu` | Middle East news (Arabic/Persian/Turkish/English) |
|
||||
| 🎖 | `Askeri Jeo` | Turkish military, geopolitics, MGK, defense industry |
|
||||
| 📚 | `E-Kitap` | E-books, audiobooks, manga, KPSS/YKS material |
|
||||
| 🌐 | `Dil & Kurs` | Russian/Swahili/English language groups, Udemy/PacktPub |
|
||||
| 📈 | `Finans` | Borsa İstanbul, trading, stock tips, central bank |
|
||||
| 💬 | `Sosyal` | Twitch, social, hobby groups, anything else |
|
||||
|
||||
Full id→folder map: `build_assignments.py:A` (~260 entries). Edit the dict, **never** edit `data/assignments.json` directly — `build_assignments.py` regenerates it.
|
||||
|
||||
### New-channel triage (unassigned id heuristic)
|
||||
|
||||
When `build_assignments.py` reports `⚠ assignment eksik`, read the channel's name and first messages from `data/channels.json`, then assign by these rules (first match wins):
|
||||
|
||||
```
|
||||
HACK / CVE / exploit / SOC / OSINT / red team / siber → Güvenlik
|
||||
cloud-free / ulp / redline / cracked / leaked logs / vbv → Logs & Cloud
|
||||
Ukraine / Russia / Donbas / Kyiv / Москва (war context) → Rus-Ukrayna
|
||||
Arabic-script (ar/fa) news, Israel/Gaza/Syria/Iran → Ortadoğu
|
||||
TSK / SİHA / NATO / geopolitics / military doctrine → Askeri Jeo
|
||||
PDF / kitap / e-book / sesli kitap / manga / KPSS / YKS → E-Kitap
|
||||
Udemy / Coursera / Russian/Swahili/Arabic/French/IELTS → Dil & Kurs
|
||||
borsa / hisse / trading / forex / kripto → Finans
|
||||
twitch / hobby / chat / barahol / banter → Sosyal
|
||||
```
|
||||
|
||||
Edge cases:
|
||||
- Russia/Ukraine **doctrine** (not war news) → `Askeri Jeo`, not `Rus-Ukrayna`.
|
||||
- Stock-tip Udemy channels → `Finans`, not `Dil & Kurs`.
|
||||
- Sesli Kitap / Manga / KPSS folded into `E-Kitap`.
|
||||
|
||||
## Telethon API constraints
|
||||
|
||||
- `DialogFilter.id` 0 and 1 are reserved (All Chats, etc.); `apply_folders.py` skips them.
|
||||
- Folder titles capped at **12 characters** by Telegram. Telegram allows up to 30 folders (100 with Premium); current schema uses 9.
|
||||
- `iter_dialogs(archived=None)` returns both normal and archived; `archived=False` (default in `tg_inbox.py`) returns only normal.
|
||||
- `iter_messages(entity, search=...)` is server-side full-text; `iter_messages(None, search=...)` is the global search.
|
||||
- Rate limits: don't run `fetch_all.py` more than ~once per hour for accounts with many dialogs (`FloodWaitError`). For sending in tight loops, sleep ≥1s between messages or be ready to handle `FloodWait`.
|
||||
- `client.send_read_acknowledge(entity)` clears unread; there is no inverse RPC to mark unread.
|
||||
|
||||
## Auth & secrets
|
||||
|
||||
- `api.txt` and `telegram_session.session` are **as good as a password**: anyone with both can read all your messages and send as you. Keep them out of git, dotfiles sync, and shared backups.
|
||||
- The MTProto session is bound to the device fingerprint Telethon presents. Telegram → Settings → Devices lists active sessions; revoke "TelegramTUI / linux" entries you don't recognize.
|
||||
- 2FA password (cloud password) is **not** stored in `.session`; you'll be prompted on first login if it's set.
|
||||
|
||||
## When NOT to run
|
||||
|
||||
- `apply_folders.py` overwrites each folder's `include_peers` — manual folder rearrangements in the Telegram client are lost. Always confirm before pushing.
|
||||
- `tg_send.py` and `tg_inbox.py --mark-all-read` are destructive in the "user-visible-side-effect" sense; default behavior is interactive confirm. Don't `--yes` blindly in a script unless the destination/payload is hard-coded and reviewed.
|
||||
- `fetch_all.py` more than ~hourly: triggers `FloodWaitError` for large accounts.
|
||||
|
||||
## Snippets cookbook
|
||||
|
||||
```python
|
||||
# One-off custom run inside the same venv:
|
||||
import asyncio
|
||||
from telethon import TelegramClient
|
||||
from config import API_ID, API_HASH, SESSION_NAME
|
||||
from tg_utils import resolve_chat, fmt_msg
|
||||
|
||||
async def main():
|
||||
async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as c:
|
||||
e = await resolve_chat(c, "@durov")
|
||||
async for m in c.iter_messages(e, limit=5):
|
||||
print(fmt_msg(m))
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python
|
||||
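# Live monitoring (event handler):
from telethon import TelegramClient, events
from config import API_ID, API_HASH, SESSION_NAME

client = TelegramClient(SESSION_NAME, API_ID, API_HASH)

@client.on(events.NewMessage(chats=["@channel1", "@channel2"]))
async def handler(event):
    print(event.chat.title, event.message.text)

client.start()  # connect and log in before blocking on updates
client.run_until_disconnected()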
|
||||
```
|
||||
|
||||
```python
|
||||
# Forward N messages from A to B:
|
||||
msgs = await client.get_messages(src_entity, ids=[101, 102, 103])
|
||||
await client.forward_messages(dst_entity, msgs)
|
||||
```
|
||||
|
||||
```python
|
||||
# Download all media from a chat into ./media/:
|
||||
async for msg in client.iter_messages(entity, limit=100):
|
||||
if msg.media:
|
||||
await msg.download_media(file="media/")
|
||||
```
|
||||
|
||||
## Related skills
|
||||
|
||||
- `obsidian-tasks` — track Telegram-organization items as tasks.
|
||||
- `news-crawler`, `freshrss`, `freshrss-reader` — alternative news ingestion paths; `Askeri Jeo`/`Ortadoğu` Telegram channels overlap with FreshRSS feeds.
|
||||
- `obsidian-linux` — once messages are extracted, can convert into vault notes via `notesmd-cli`.
|
||||
98
personas/_shared/skills/telegram/scripts/apply_folders.py
Normal file
98
personas/_shared/skills/telegram/scripts/apply_folders.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
STEP 3 — Create/update in Telegram the folders listed in data/assignments.json.

assignments.json format:
{
  "folders": [{"title": "...", "emoticon": "🛡"}, ...],
  "assignments": {"<channel_id>": "FolderTitle", ...}
}
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from telethon import TelegramClient
|
||||
from telethon.tl.functions.messages import (
|
||||
GetDialogFiltersRequest,
|
||||
UpdateDialogFilterRequest,
|
||||
)
|
||||
from telethon.tl.types import DialogFilter, TextWithEntities
|
||||
|
||||
from config import API_HASH, API_ID, SESSION_NAME
|
||||
|
||||
DATA_FILE = Path(__file__).parent / "data" / "assignments.json"
|
||||
|
||||
|
||||
def _title_text(f) -> str:
    """Return the plain-text title of a dialog filter.

    The ``title`` attribute may be missing or ``None`` (yields ``""``), an
    object exposing ``.text`` (that text is returned), or anything else
    (returned via ``str()``).
    """
    title = getattr(f, "title", None)
    if title is None:
        return ""
    if hasattr(title, "text"):
        return title.text
    return str(title)
|
||||
|
||||
|
||||
async def main() -> None:
    """Create or update Telegram chat folders from ``data/assignments.json``.

    Prints a per-folder preview, asks for an interactive ``y`` confirmation,
    then pushes one ``DialogFilter`` per entry of ``folders``. A folder whose
    title already exists is updated in place (its peer list is replaced);
    otherwise a fresh filter id is allocated above every id already in use.

    Folders that end up with no resolvable chats are skipped with a warning:
    the filter is built with every include flag off, and Telegram rejects
    such a filter when its peer list is empty (FILTER_INCLUDE_EMPTY), which
    would otherwise abort the run halfway through.
    """
    cfg = json.loads(DATA_FILE.read_text(encoding="utf-8"))
    folders_meta = cfg["folders"]  # ordered, each with title + emoji
    assignments: dict[str, str] = cfg["assignments"]  # "<chat id>" -> folder title

    # Group chat ids by the folder title they were assigned to.
    buckets: dict[str, list[int]] = defaultdict(list)
    for sid, title in assignments.items():
        buckets[title].append(int(sid))

    print("Önizleme:")
    for f in folders_meta:
        n = len(buckets.get(f["title"], []))
        # .get(): the emoticon is optional, same as when the filter is built.
        print(f" {f.get('emoticon') or ''} {f['title']:<22} {n:>3} sohbet")

    if input("\nTelegram'a uygula? [y/N]: ").strip().lower() != "y":
        print("iptal.")
        return

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        resp = await client(GetDialogFiltersRequest())
        # Depending on the API layer the response is a wrapper or a bare list.
        existing = resp.filters if hasattr(resp, "filters") else resp
        by_title: dict[str, DialogFilter] = {}
        used_ids: set[int] = {0, 1}  # ids 0 and 1 are never handed out here
        for f in existing:
            if isinstance(f, DialogFilter):
                by_title[_title_text(f)] = f
                used_ids.add(f.id)

        next_id = max(used_ids) + 1

        for fmeta in folders_meta:
            title = fmeta["title"]
            emoticon = fmeta.get("emoticon")

            include_peers = []
            for cid in buckets.get(title, []):
                try:
                    include_peers.append(await client.get_input_entity(cid))
                except Exception as e:  # best effort: one bad id must not stop the run
                    print(f" ! {cid} eklenemedi: {e}")

            if not include_peers:
                # Nothing to include and no include flags set: Telegram would
                # refuse this filter, so leave the folder untouched instead.
                print(f" ! {title}: eklenecek sohbet yok, atlandı")
                continue

            if title in by_title:
                fid = by_title[title].id
                action = "güncellendi"
            else:
                fid = next_id
                next_id += 1
                action = "oluşturuldu"

            df = DialogFilter(
                id=fid,
                title=TextWithEntities(text=title, entities=[]),
                pinned_peers=[],
                include_peers=include_peers,
                exclude_peers=[],
                contacts=False, non_contacts=False, groups=False,
                broadcasts=False, bots=False,
                exclude_muted=False, exclude_read=False, exclude_archived=False,
                emoticon=emoticon,
            )
            await client(UpdateDialogFilterRequest(id=fid, filter=df))
            print(f"✓ {emoticon or ''} {title} ({len(include_peers)}) — {action}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
296
personas/_shared/skills/telegram/scripts/build_assignments.py
Normal file
296
personas/_shared/skills/telegram/scripts/build_assignments.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""Claude'un elle yaptığı kategorizasyon → data/assignments.json."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Telegram caps a folder title at 12 characters.
FOLDERS = [
    {"title": "Güvenlik", "emoticon": "🛡"},
    {"title": "Logs & Cloud", "emoticon": "☁"},
    {"title": "Rus-Ukrayna", "emoticon": "⚔"},
    {"title": "Ortadoğu", "emoticon": "🕌"},
    {"title": "Askeri Jeo", "emoticon": "🎖"},
    {"title": "E-Kitap", "emoticon": "📚"},  # audiobook/manga and KPSS exam chats live here too
    {"title": "Dil & Kurs", "emoticon": "🌐"},
    {"title": "Finans", "emoticon": "📈"},  # new: stock market / trading
    {"title": "Sosyal", "emoticon": "💬"},  # remaining social / entertainment chats
]
|
||||
|
||||
# id → folder title
|
||||
A = {
|
||||
# --- Siber Güvenlik ---
|
||||
-1003772746107: "Güvenlik", # Born2beroot
|
||||
-1001182095274: "Güvenlik", # Cyber Threat Intelligence Feeds
|
||||
-1001601457644: "Güvenlik", # Linux Türkiye Topluluğu
|
||||
-5245874036: "Güvenlik", # APT10
|
||||
-1001424015690: "Güvenlik", # Siber Kulüpler Birliği
|
||||
-1002044403490: "Güvenlik", # Siber Güvenlik Turkey
|
||||
-1001433765532: "Güvenlik", # Türkiye Amatör Telsiz
|
||||
-1001448773154: "Güvenlik", # tinyGS Community
|
||||
-1001248961775: "Güvenlik", # Geek Hacker
|
||||
-1001820205147: "Güvenlik", # PSD
|
||||
-1001486620605: "Güvenlik", # OpenStreetMap Türkiye
|
||||
-1001224374951: "Güvenlik", # Siberdinc
|
||||
-1001175709038: "Güvenlik", # Özgür Yazılım Derneği
|
||||
-1001102366261: "Güvenlik", # Dark Web Intelligence
|
||||
-1001705864902: "Güvenlik", # HackCodeRepeat
|
||||
-4895889925: "Güvenlik", # Born2beroot (küçük)
|
||||
-1001560793071: "Güvenlik", # ForenSec
|
||||
-1001099338447: "Güvenlik", # burpsuite (unofficial)
|
||||
-1002019701877: "Güvenlik", # sc_sibermagazin
|
||||
-1001968311017: "Güvenlik", # CVE
|
||||
-4762809106: "Güvenlik", # RaCONF'25
|
||||
-4960718009: "Güvenlik", # OpZ
|
||||
-1001425186624: "Güvenlik", # Zer0Day Lab
|
||||
-1001369540037: "Güvenlik", # inj3ct0r exploit db
|
||||
-1002601559408: "Güvenlik", # Garuda Error System
|
||||
-1002389372004: "Güvenlik", # AnonSec (hacker crew)
|
||||
|
||||
# --- Logs & Cloud ---
|
||||
-1002696769378: "Logs & Cloud", # Valide Cloud Free
|
||||
-1001628710143: "Logs & Cloud", # Omega Cloud
|
||||
-1001921972180: "Logs & Cloud", # Burn Cloud
|
||||
-1001939548708: "Logs & Cloud", # Trident Cloud
|
||||
-1001602298018: "Logs & Cloud", # Free xbox game pass
|
||||
-1002231096661: "Logs & Cloud", # Vpesports Xbox
|
||||
-1002592627432: "Logs & Cloud", # Cvv190 Cloud
|
||||
-1002107853176: "Logs & Cloud", # Plutonium logs
|
||||
-1002047552897: "Logs & Cloud", # Бесплатный лицензионный
|
||||
-1002575521311: "Logs & Cloud", # Darknes-Cloud
|
||||
-1002355411584: "Logs & Cloud", # Roves Cloud
|
||||
-1002025418650: "Logs & Cloud", # Valide Cloud FREE
|
||||
-1002849195507: "Logs & Cloud", # azef cloud
|
||||
-1001440229722: "Logs & Cloud", # Freedom F0x
|
||||
-1002294768789: "Logs & Cloud", # D49d3k ULP-Cloud
|
||||
-1002415889954: "Logs & Cloud", # scale invite
|
||||
-1001773319933: "Logs & Cloud", # CRYPTOLOGS REDLINE
|
||||
-1001578557816: "Logs & Cloud", # Link Arşivleri
|
||||
-1001672949739: "Logs & Cloud", # BerserkLogs
|
||||
|
||||
# --- Rus-Ukrayna ---
|
||||
-1001668977160: "Rus-Ukrayna", # Rybar in English
|
||||
-1001326223284: "Rus-Ukrayna", # Рыбарь
|
||||
-1001082968817: "Rus-Ukrayna", # Минобороны России
|
||||
-1001513431778: "Rus-Ukrayna", # Два майора
|
||||
-1001475819126: "Rus-Ukrayna", # Роскосмос
|
||||
-1001220606936: "Rus-Ukrayna", # STERNENKO
|
||||
-1001783035076: "Rus-Ukrayna", # TrackANaziMerc
|
||||
-1001003313758: "Rus-Ukrayna", # Новости Москвы
|
||||
-1001654562332: "Rus-Ukrayna", # TASS
|
||||
-1001386375324: "Rus-Ukrayna", # МВС України
|
||||
-1001173684180: "Rus-Ukrayna", # ЧП / Крым
|
||||
-1001310984791: "Rus-Ukrayna", # Intel Slava
|
||||
-1001747148099: "Rus-Ukrayna", # Судоплатов
|
||||
-1002121256650: "Rus-Ukrayna", # Угруповання об'єднаних сил
|
||||
-1001583313036: "Rus-Ukrayna", # АРХАНГЕЛ СПЕЦНАЗА
|
||||
-1001352726486: "Rus-Ukrayna", # INSIDER UA
|
||||
-1001669110938: "Rus-Ukrayna", # UNITED24Media
|
||||
-1001350274993: "Rus-Ukrayna", # Tim Kirby Russia Hardcore
|
||||
-1001463721328: "Rus-Ukrayna", # Zelenskiy Official
|
||||
-1001117303064: "Rus-Ukrayna", # Россия в глобальной политике
|
||||
-1001509172593: "Rus-Ukrayna", # monitorwar
|
||||
-1001222633586: "Rus-Ukrayna", # FEDOROV
|
||||
-1001469021333: "Rus-Ukrayna", # DeepState
|
||||
-1001616052141: "Rus-Ukrayna", # Проект «Хочу жить»
|
||||
-1001900958834: "Rus-Ukrayna", # Ігор Клименко МВС
|
||||
-1001617325371: "Rus-Ukrayna", # Десантно-штурмові війська ЗСУ
|
||||
-1002490955621: "Rus-Ukrayna", # DIPLOMATIE RUSSE
|
||||
-1001385909762: "Rus-Ukrayna", # Артем Дмитрук
|
||||
-1001764041965: "Rus-Ukrayna", # Kremlin News EN
|
||||
-1001790907266: "Rus-Ukrayna", # Кремль Новости RU
|
||||
-1003222724492: "Rus-Ukrayna", # Ionfall
|
||||
-1001936622736: "Rus-Ukrayna", # ЖАХ З НЕБЕС 123
|
||||
-1002029042694: "Rus-Ukrayna", # 123 омсбр
|
||||
-1002051535105: "Rus-Ukrayna", # 114 Бригада
|
||||
|
||||
# --- Ortadoğu ---
|
||||
-1002062736232: "Ortadoğu", # نايا - NAYA
|
||||
-1002059959435: "Ortadoğu", # UAE MoD (multilingual AR)
|
||||
-1001272529767: "Ortadoğu", # Middle East News
|
||||
-1001822461311: "Ortadoğu", # JHArnous
|
||||
-1002263475135: "Ortadoğu", # Syrian FM
|
||||
-1001226363458: "Ortadoğu", # Stay Free
|
||||
-1001048133085: "Ortadoğu", # تَأكّدْ
|
||||
-1001081687249: "Ortadoğu", # مركز الزيتونة AR
|
||||
-1001147346052: "Ortadoğu", # Al-Zaytouna EN
|
||||
-1002142228056: "Ortadoğu", # Elly_bar Israel-Hamas
|
||||
-1001797479924: "Ortadoğu", # بيان نيوز
|
||||
-1001180533415: "Ortadoğu", # Orient - أورينت
|
||||
-1001463836083: "Ortadoğu", # Suriye Milli Ordusu
|
||||
-1002280669663: "Ortadoğu", # خیابون انقلاب
|
||||
-1002450267230: "Ortadoğu", # خیابون انقلاب (dup)
|
||||
|
||||
# --- Askeri & Jeopolitik ---
|
||||
-1001173129471: "Askeri Jeo", # AZERTAC
|
||||
-1002143761332: "Askeri Jeo", # Askeri İstihbarat Sohbet
|
||||
-1001508782705: "Askeri Jeo", # 3. Dünya Savaşı
|
||||
-1001802903419: "Askeri Jeo", # Askeri İstihbarat TR
|
||||
-1001220118870: "Askeri Jeo", # Enformasyon
|
||||
-1001251299061: "Askeri Jeo", # SouthFront
|
||||
-1001699619673: "Askeri Jeo", # The Grayzone
|
||||
-1001689501969: "Askeri Jeo", # Fokus+
|
||||
-1001734228215: "Askeri Jeo", # People's Daily China
|
||||
-1001810182217: "Askeri Jeo", # Rerum Novarum
|
||||
-1002642181270: "Askeri Jeo", # Gallipoli General
|
||||
-1001834311682: "Askeri Jeo", # SOFTAÇAM
|
||||
-1001857092414: "Askeri Jeo", # FahrettinAltay_
|
||||
-1002334106447: "Askeri Jeo", # Source News
|
||||
-1001601338144: "Askeri Jeo", # ASKERİ HARP
|
||||
-1002388640996: "Askeri Jeo", # Military Vibe
|
||||
-1001055365200: "Askeri Jeo", # Nairobi News
|
||||
-1001127820109: "Askeri Jeo", # Bellingcat
|
||||
-990795574: "Askeri Jeo", # Milli Güvenlik Kurulu
|
||||
-1001381692248: "Askeri Jeo", # Rusya Ankara Büyükelçiliği
|
||||
|
||||
# --- E-Kitap ---
|
||||
-1001968002316: "E-Kitap", # E Kütüphanem
|
||||
-1001295770478: "E-Kitap", # Kitap Turşusu Premium
|
||||
-1001273763604: "E-Kitap", # Kitap Evreni
|
||||
-1001176839029: "E-Kitap", # e-kitap yardımlaşma
|
||||
-1003179138041: "E-Kitap", # Kitap Botu PDF
|
||||
-1001948357383: "E-Kitap", # PDF E Kitap İstek
|
||||
-1001267622915: "E-Kitap", # E Kitap Grup
|
||||
-1003339908160: "E-Kitap", # Kitap Arama Grubu
|
||||
-1001884485811: "E-Kitap", # Kitaplık Rafı
|
||||
-1001219338945: "E-Kitap", # e-Babil Kütüphanesi
|
||||
-1002761890261: "E-Kitap", # E kitap Roman PDF
|
||||
-1001379065337: "E-Kitap", # Dijital Kitap
|
||||
-1001436274859: "E-Kitap", # E-Kitap Oku
|
||||
-1002231474242: "E-Kitap", # E - Kitap PDF
|
||||
-1001837236620: "E-Kitap", # E-Kitap Paylaşım Sohbet
|
||||
-1001896451121: "E-Kitap", # Kitap Modu
|
||||
-1002123805391: "E-Kitap", # Kitap PDF Arşivi Roman Hikaye
|
||||
-1001651874667: "E-Kitap", # E Kitap PDF
|
||||
-1001869548408: "E-Kitap", # Aranan Kitapçık duyuru
|
||||
-1001741842267: "E-Kitap", # PDF Kitap Evreni
|
||||
-1002844665098: "E-Kitap", # PDF KİTAP
|
||||
-1001379762150: "E-Kitap", # Atatürk Pdf Kitap
|
||||
-1001380972711: "E-Kitap", # BUNDLE Kitap epub pdf
|
||||
-1002677555843: "E-Kitap", # Büyük Kitap Arşivi
|
||||
-1002084828902: "E-Kitap", # Kitapçı PDF Arşivi
|
||||
-1002502833110: "E-Kitap", # PDF KİTAP ROMAN HİKAYE
|
||||
-1002969079664: "E-Kitap", # PDF KİTAP ARŞİV
|
||||
-1003491537567: "E-Kitap", # YATIRIM KİTAPLARI
|
||||
-1001625595378: "E-Kitap", # Kütübhâne-i Tevârîh
|
||||
-1001616159980: "E-Kitap", # Books
|
||||
-1001916886683: "E-Kitap", # PDF Kitap İndir
|
||||
-1002066019978: "E-Kitap", # PDF Kitap Yurdu
|
||||
-1002233958112: "E-Kitap", # PDF Kitaplar pdfstok
|
||||
-1002739085389: "E-Kitap", # Telegram Kitap Grupları
|
||||
-1003450748883: "E-Kitap", # KÜTÜPHANE
|
||||
|
||||
# --- Sesli & Manga ---
|
||||
-1001651817526: "E-Kitap" , # RiF Новеллы, ранобэ и фф
|
||||
-1001559096136: "E-Kitap" , # Sesli Kitap
|
||||
-1002267174397: "E-Kitap" , # Aaron Arşiv
|
||||
-1001851524017: "E-Kitap" , # Hentai TV
|
||||
-1003037921710: "E-Kitap" , # Sesli Kitap Storytel
|
||||
-1003026138059: "E-Kitap" , # Sesli Kitap Dinlio
|
||||
-1003106544769: "E-Kitap" , # SESLİ KİTAP EDEBİYAT
|
||||
-1003483217842: "E-Kitap" , # MANGA KİTAPLARI
|
||||
-1002269816836: "E-Kitap" , # Anime Maniaxx
|
||||
-1003179794694: "E-Kitap" , # Dergi PDF Arşivi
|
||||
-1001519763115: "E-Kitap" , # Sesli Kitap Dinle
|
||||
-1003417275151: "E-Kitap" , # ÇİZGİ ROMAN KİTAPLARI
|
||||
|
||||
# --- KPSS & YKS ---
|
||||
-1002788272998: "E-Kitap" , # AGS KPSS PDF
|
||||
-1002967115062: "E-Kitap" , # YDS YÖKDİL
|
||||
-1002335523660: "E-Kitap" , # KPSS YKS KİTAP PDF
|
||||
-1002164684267: "E-Kitap" , # Yks PDF AYT TYT
|
||||
-1003029282639: "E-Kitap" , # KİTAP PDF YKS KPSS
|
||||
-1003332920930: "E-Kitap" , # SINAV KAYNAKLARI
|
||||
|
||||
# --- Dil & Kurs ---
|
||||
-1001279165634: "Dil & Kurs", # Udemy Courses Free
|
||||
-1001498152897: "Dil & Kurs", # Eduonix Courses Free
|
||||
-1001005463014: "Dil & Kurs", # PacktPub Free Learning
|
||||
-1001044241441: "Dil & Kurs", # Books Mania (grammar)
|
||||
-1002973548671: "Dil & Kurs", # RUSÇA ÖĞREN SOHBET
|
||||
-1001541869122: "Dil & Kurs", # I speak russian
|
||||
-1001205656183: "Dil & Kurs", # Russian Microlearning
|
||||
-1001262177780: "Dil & Kurs", # Russian With Max
|
||||
-1002374924223: "Dil & Kurs", # LLama Russian Study
|
||||
-1001475363663: "Dil & Kurs", # LEARN SWAHILI
|
||||
-1001654101128: "Dil & Kurs", # Russian for lunch
|
||||
-1001912229645: "Dil & Kurs", # Russian home
|
||||
-1001933331449: "Dil & Kurs", # Study Russian
|
||||
-1002647054427: "Dil & Kurs", # Tutorial for new joiners
|
||||
-1003023929968: "Dil & Kurs", # RUSÇA ÖĞRENİYORUM 2025
|
||||
-1001159423770: "Dil & Kurs", # English Books Magazines Novels
|
||||
-4509421355: "Dil & Kurs", # Russian LLama
|
||||
-1001612802963: "Dil & Kurs", # Vitabu vya Kiislamu (Swahili)
|
||||
-1001807530830: "Dil & Kurs", # Ankara Rus Evi
|
||||
|
||||
# --- Sosyal & Diğer ---
|
||||
-1001379307100: "Sosyal" , # Ламповая беседка
|
||||
-1001887551302: "Sosyal" , # ТРОЕТОЧИЕ
|
||||
-1001492338580: "Sosyal" , # Sirius poets
|
||||
-1001751338081: "Sosyal" , # Geometric Telegramssion
|
||||
-1001760743689: "Sosyal" , # Квартал красных фонарей
|
||||
-1001714372021: "Sosyal" , # Kaktüs v2.0
|
||||
-1002464236122: "Sosyal" , # Malvinkin Twitch
|
||||
-1001865528673: "Sosyal" , # Fiftnmls
|
||||
-1002466936546: "Finans" , # İnfo Yatırım Hisse
|
||||
-1001961199646: "Sosyal" , # аничух (twitch)
|
||||
-1003321701261: "Finans" , # ADRENALİN TRADE
|
||||
-1001613153861: "Sosyal" , # FOŞİX ERLİK
|
||||
-1001495437712: "Sosyal" , # Erlik Video Deposu
|
||||
-1001363595671: "Finans" , # Advo
|
||||
-1001874359773: "Finans" , # Udemy Türkçe (stock tips)
|
||||
-1001591294939: "Finans" , # Hazine-i BORSA
|
||||
-4789484210: "Finans" , # Trade
|
||||
-1002517056894: "Sosyal" , # Барахолка Москва
|
||||
-1003222134628: "Sosyal" , # GAME CHILL
|
||||
-1001476005114: "Finans" , # cBank
|
||||
-901188134: "Sosyal" , # CHP GENÇLİK
|
||||
-4543354861: "Sosyal" , # Atatürkçüler Birliği
|
||||
-693237968: "Sosyal" , # İzmir Kavram
|
||||
-525645675: "Sosyal" , # GB - Jeoloji
|
||||
-567627579: "Sosyal" , # GB - Psikoloji
|
||||
-562687596: "Sosyal" , # GB - tarih
|
||||
-537589148: "Sosyal" , # GB - tıp
|
||||
-541262586: "Sosyal" , # GB - kimya
|
||||
-516377683: "Sosyal" , # GB - biyoloji
|
||||
-500211559: "Sosyal" , # GB - mühendislik
|
||||
}
|
||||
|
||||
|
||||
def main() -> None:
    """Validate the hand-made mapping ``A`` and write ``data/assignments.json``.

    Reads ``data/channels.json`` next to this script, then warns about:

    * chats present in channels.json that have no assignment,
    * assigned ids that do not appear in channels.json,
    * folder titles used in ``A`` that are not declared in ``FOLDERS``.

    The output file is written regardless of warnings; a per-folder count
    summary is printed afterwards.
    """
    from collections import Counter  # local import: only this function needs it

    here = Path(__file__).parent
    channels = json.loads((here / "data" / "channels.json").read_text(encoding="utf-8"))

    # One pass to index names, instead of rescanning the list per missing id.
    names_by_id = {c["id"]: c.get("name", "?") for c in channels}
    channel_ids = set(names_by_id)
    assigned_ids = set(A)

    # Sorted so the warnings are stable from run to run.
    missing = sorted(channel_ids - assigned_ids)
    extra = sorted(assigned_ids - channel_ids)
    if missing:
        print("⚠ assignment eksik:")
        for mid in missing:
            print(f" {mid} {names_by_id[mid]!r}")
    if extra:
        print("⚠ assignment'da fazladan ID var:", extra)

    # A title that is not declared in FOLDERS never gets a folder created for
    # it, so its chats would be dropped without notice.
    declared_titles = {f["title"] for f in FOLDERS}
    undeclared = sorted(set(A.values()) - declared_titles)
    if undeclared:
        print("⚠ FOLDERS'da tanımlı olmayan klasör:", undeclared)

    # JSON object keys must be strings.
    assignments_str = {str(k): v for k, v in A.items()}
    counts = Counter(A.values())

    out = here / "data" / "assignments.json"
    out.write_text(
        json.dumps(
            {"folders": FOLDERS, "assignments": assignments_str},
            ensure_ascii=False, indent=2,
        ),
        encoding="utf-8",
    )

    print(f"\n✓ {len(A)} atama → {out}")
    for f in FOLDERS:
        n = counts.get(f["title"], 0)
        print(f" {f['emoticon']} {f['title']:<22} {n:>3}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
35
personas/_shared/skills/telegram/scripts/categorize.py
Normal file
35
personas/_shared/skills/telegram/scripts/categorize.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
STEP 2 — Read the classification result (chat ID → folder) from data/assignments.json.

assignments.json format (as written by build_assignments.py):
{
  "folders": [{"title": "Folder1", "emoticon": "🛡"}, ...],
  "assignments": {"<channel_id>": "Folder1", ...}
}

This file is produced by Claude after analysing data/channels.json.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
_ASSIGN_FILE = Path(__file__).parent / "data" / "assignments.json"
_cache: dict | None = None  # parsed assignments.json, filled on first use


def _load() -> dict:
    """Return the parsed assignments file, reading it from disk only once.

    Raises:
        FileNotFoundError: if ``data/assignments.json`` does not exist yet.
    """
    global _cache
    if _cache is not None:
        return _cache

    if not _ASSIGN_FILE.exists():
        raise FileNotFoundError(
            f"{_ASSIGN_FILE} yok. Önce data/channels.json üretilmeli, "
            "sonra Claude assignments.json'u yazacak."
        )

    raw = _ASSIGN_FILE.read_text(encoding="utf-8")
    _cache = json.loads(raw)
    return _cache
|
||||
|
||||
|
||||
def categorize(channel: dict) -> str | None:
    """Look up the folder title assigned to *channel*.

    The lookup key is ``str(channel["id"])``; returns ``None`` when the chat
    has no entry in the assignments mapping.
    """
    mapping = _load()["assignments"]
    return mapping.get(str(channel["id"]))
|
||||
9
personas/_shared/skills/telegram/scripts/config.py
Normal file
9
personas/_shared/skills/telegram/scripts/config.py
Normal file
@@ -0,0 +1,9 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# Credentials are parsed out of api.txt, which sits next to this module.
_API_TXT = Path(__file__).parent / "api.txt"
_text = _API_TXT.read_text(encoding="utf-8")


def _extract(pattern: str, label: str) -> str:
    """Return the first capture group of *pattern* found in api.txt.

    Raises:
        ValueError: if api.txt has no matching entry. Without this check the
            failure would surface as an opaque ``AttributeError`` on ``None``.
    """
    match = re.search(pattern, _text)
    if match is None:
        raise ValueError(f"{label} not found in {_API_TXT}: expected a '{label}: <value>' entry")
    return match.group(1)


API_ID = int(_extract(r"api_id:\s*\n?\s*(\d+)", "api_id"))
API_HASH = _extract(r"api_hash:\s*\n?\s*([a-f0-9]+)", "api_hash")
# Path prefix of the session file, kept in the same directory as this module.
SESSION_NAME = str(Path(__file__).parent / "telegram_session")
|
||||
63
personas/_shared/skills/telegram/scripts/fetch_all.py
Normal file
63
personas/_shared/skills/telegram/scripts/fetch_all.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""
|
||||
ADIM 1 — Her grup/kanal + son mesajları çek, data/channels.json'a kaydet.
|
||||
|
||||
Arşivli olanlar da dahil (iter_dialogs(archived=None) hem normal hem arşivli getirir).
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
from config import API_ID, API_HASH, SESSION_NAME
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
OUTPUT = DATA_DIR / "channels.json"

MESSAGE_SAMPLE = 40  # how many recent messages to request per chat
MESSAGE_CHAR_LIMIT = 600  # each stored message is cut to this many characters


async def _sample_messages(client, entity) -> list[str]:
    """Collect the non-empty recent message texts of one chat, truncated.

    Any error while iterating is reported and whatever was gathered up to
    that point is still returned.
    """
    texts: list[str] = []
    try:
        async for item in client.iter_messages(entity, limit=MESSAGE_SAMPLE):
            body = (item.message or "").strip()
            if not body:
                continue
            texts.append(body[:MESSAGE_CHAR_LIMIT])
    except Exception as e:
        print(f" ! mesaj çekilemedi: {e}")
    return texts


async def main() -> None:
    """Dump every group/channel dialog plus a message sample to channels.json.

    Dialogs are listed with ``archived=None``; dialogs that are neither a
    group nor a channel are ignored. The JSON file is written once, after
    all dialogs have been visited.
    """
    DATA_DIR.mkdir(exist_ok=True)

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        me = await client.get_me()
        print(f"Bağlandı: @{me.username or me.first_name}\n")

        results: list[dict] = []
        async for d in client.iter_dialogs(archived=None):
            wanted = d.is_group or d.is_channel
            if not wanted:
                continue

            position = len(results) + 1
            print(f"[{position:>3}] {d.name} (arşiv={bool(d.archived)})")

            sample = await _sample_messages(client, d.entity)

            if d.is_channel and not d.is_group:
                chat_type = "channel"
            else:
                chat_type = "group"

            record = {
                "id": d.id,
                "name": d.name or "",
                "type": chat_type,
                "is_broadcast": bool(getattr(d.entity, "broadcast", False)),
                "archived": bool(d.archived),
                "unread_count": d.unread_count,
                "messages": sample,
            }
            results.append(record)

    payload = json.dumps(results, ensure_ascii=False, indent=2)
    OUTPUT.write_text(payload, encoding="utf-8")
    print(f"\n✓ {len(results)} sohbet kaydedildi -> {OUTPUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1 @@
|
||||
telethon>=1.43.1
|
||||
77
personas/_shared/skills/telegram/scripts/tg_inbox.py
Normal file
77
personas/_shared/skills/telegram/scripts/tg_inbox.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Show unread chats — your real inbox view.
|
||||
|
||||
Usage:
|
||||
python tg_inbox.py # all unread, sorted by count desc
|
||||
python tg_inbox.py --top 20
|
||||
python tg_inbox.py --include-archived
|
||||
python tg_inbox.py --mark-read "Born2bero" # zero-out a specific chat
|
||||
python tg_inbox.py --mark-all-read --yes # nuke ALL unread (destructive)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
from config import API_HASH, API_ID, SESSION_NAME
|
||||
from tg_utils import confirm, resolve_chat
|
||||
|
||||
|
||||
async def main() -> None:
    """List unread chats and optionally mark them as read.

    Modes, in order of precedence:

    * ``--mark-read CHAT``: mark that one chat as read and exit.
    * default: print unread chats sorted by unread count, descending,
      optionally truncated with ``--top N``.
    * ``--mark-all-read``: after printing, mark every *listed* chat as read,
      asking for confirmation unless ``--yes`` is given.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("--top", type=int, default=0, help="show only top N (default: all)")
    p.add_argument("--include-archived", action="store_true",
                   help="include archived dialogs (default: only normal)")
    p.add_argument("--mark-read", help="mark this specific chat as read")
    p.add_argument("--mark-all-read", action="store_true",
                   help="mark every unread chat as read (DESTRUCTIVE)")
    p.add_argument("--yes", "-y", action="store_true", help="skip confirmation")
    args = p.parse_args()

    # archived=None lists normal and archived dialogs; False lists only normal.
    archived = None if args.include_archived else False

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        # Single-target mark-read
        if args.mark_read:
            entity = await resolve_chat(client, args.mark_read)
            await client.send_read_acknowledge(entity)
            title = getattr(entity, "title", None) or getattr(entity, "username", None) or str(entity.id)
            print(f"✓ {title} marked as read")
            return

        # Collect unread rows as (unread_count, display name, dialog id, kind).
        unread: list[tuple[int, str, int, str]] = []
        async for d in client.iter_dialogs(archived=archived):
            if d.unread_count <= 0:
                continue
            if d.is_channel and not d.is_group:
                kind = "channel"
            elif d.is_group:
                kind = "group"
            else:
                kind = "user"
            unread.append((d.unread_count, d.name or str(d.id), d.id, kind))

        unread.sort(reverse=True)
        # Only a positive N truncates; a negative value used to slice rows off
        # the end of the list, which is never what --top is meant to do.
        if args.top > 0:
            unread = unread[:args.top]

        total = sum(n for n, *_ in unread)
        print(f"# {len(unread)} unread chats — {total} unread messages\n")
        for n, name, cid, kind in unread:
            print(f" {n:>5} [{kind:<7}] {name} (id={cid})")

        # Mark-all-read (skipped entirely when there is nothing to mark).
        if args.mark_all_read and unread:
            print()
            if not args.yes and not confirm(f"Mark ALL {len(unread)} chats as read?"):
                print("iptal.")
                return
            for _, name, cid, _ in unread:
                try:
                    await client.send_read_acknowledge(await client.get_input_entity(cid))
                    print(f" ✓ {name}")
                except Exception as e:  # best effort: keep going through the list
                    print(f" ! {name}: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
71
personas/_shared/skills/telegram/scripts/tg_read.py
Normal file
71
personas/_shared/skills/telegram/scripts/tg_read.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Read messages from a Telegram chat.
|
||||
|
||||
Usage:
|
||||
python tg_read.py "@username"
|
||||
python tg_read.py "Born2beroot" --limit 50
|
||||
python tg_read.py -1001182095274 --since 2026-04-01
|
||||
python tg_read.py "@durov" --limit 5 --json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
from config import API_HASH, API_ID, SESSION_NAME
|
||||
from tg_utils import fmt_msg, parse_date, resolve_chat
|
||||
|
||||
|
||||
async def main() -> None:
    """Print (or emit as JSON) messages from one chat.

    ``--since`` switches to oldest-first iteration starting at that date;
    ``--search`` adds a text filter. With ``--mark-read`` only messages up to
    the newest one actually fetched are acknowledged, so newer messages that
    were never shown (e.g. with ``--since`` plus a small ``--limit``) keep
    their unread state. If nothing was fetched, nothing is marked.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("chat", help="@username, numeric id, or name substring")
    p.add_argument("--limit", "-n", type=int, default=20, help="max messages (default 20)")
    p.add_argument("--since", help="YYYY-MM-DD; only newer than this date")
    p.add_argument("--search", "-s", help="filter to messages containing this text")
    p.add_argument("--json", action="store_true", help="emit JSON instead of table")
    p.add_argument("--mark-read", action="store_true", help="mark fetched messages as read")
    args = p.parse_args()

    offset_date = parse_date(args.since) if args.since else None

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        entity = await resolve_chat(client, args.chat)
        title = getattr(entity, "title", None) or getattr(entity, "username", None) or str(entity.id)

        if not args.json:
            print(f"# {title} (id={entity.id})\n")

        kwargs = {"limit": args.limit}
        if offset_date:
            # reverse=True walks oldest -> newest starting from offset_date.
            kwargs["reverse"] = True
            kwargs["offset_date"] = offset_date
        if args.search:
            kwargs["search"] = args.search

        rows = []
        newest_id = 0  # highest message id seen; 0 means nothing was fetched
        async for msg in client.iter_messages(entity, **kwargs):
            newest_id = max(newest_id, msg.id)
            if args.json:
                rows.append({
                    "id": msg.id,
                    "date": msg.date.isoformat(),
                    "sender_id": msg.sender_id,
                    "text": msg.message or "",
                    "has_media": msg.media is not None,
                    "reply_to": msg.reply_to_msg_id,
                })
            else:
                print(fmt_msg(msg))

        if args.json:
            print(json.dumps(rows, ensure_ascii=False, indent=2))

        if args.mark_read:
            if newest_id:
                # Acknowledge up to the newest fetched message, not the whole chat.
                await client.send_read_acknowledge(entity, max_id=newest_id)
                if not args.json:
                    print(f"\n✓ {title} marked as read")
            elif not args.json:
                print("\nno messages fetched; nothing marked as read")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
72
personas/_shared/skills/telegram/scripts/tg_search.py
Normal file
72
personas/_shared/skills/telegram/scripts/tg_search.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Search messages — globally or scoped to one chat.
|
||||
|
||||
Usage:
|
||||
python tg_search.py "CVE-2024" # global, last 50 hits
|
||||
python tg_search.py "kitap" --chat "E Kitap" # scoped to one chat
|
||||
python tg_search.py "Putin" --since 2026-04-01 --limit 100
|
||||
python tg_search.py "report" --chat me # only Saved Messages
|
||||
|
||||
Global search uses Telegram's server-side message index (telethon
|
||||
client.iter_messages(None, search=...)).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
from config import API_HASH, API_ID, SESSION_NAME
|
||||
from tg_utils import fmt_msg, parse_date, resolve_chat
|
||||
|
||||
|
||||
async def main() -> None:
    """Search messages server-side, globally or inside one chat.

    Without ``--chat`` the search runs over all chats and every hit is
    prefixed with the title of the chat it came from. Titles are cached per
    chat, keyed by the message's ``chat_id``, so users, basic groups and
    channels that share the same bare numeric id cannot be confused.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("query", help="search text")
    p.add_argument("--chat", help="restrict to this chat (@user/id/name/me)")
    p.add_argument("--limit", "-n", type=int, default=50)
    p.add_argument("--since", help="YYYY-MM-DD lower bound")
    args = p.parse_args()

    offset_date = parse_date(args.since) if args.since else None

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        if args.chat:
            entity = await resolve_chat(client, "me" if args.chat in {"me", "self"} else args.chat)
        else:
            entity = None

        kwargs = {"search": args.query, "limit": args.limit}
        if offset_date:
            # NOTE(review): Telethon's iter_messages docs say `reverse` cannot
            # be used when both entity and ids are None, so --since on a global
            # search may not act as a lower bound — confirm against the
            # installed Telethon version.
            kwargs["reverse"] = True
            kwargs["offset_date"] = offset_date

        # Cache chat titles to annotate global hits.
        chat_titles: dict[int, str] = {}

        async def title_for(msg) -> str:
            """Return a display title for the chat *msg* belongs to."""
            chat_id = msg.chat_id
            if chat_id in chat_titles:
                return chat_titles[chat_id]
            try:
                # Uses the chat already attached to the message when available.
                chat = await msg.get_chat()
                t = getattr(chat, "title", None) or getattr(chat, "username", None) or str(chat_id)
            except Exception:  # best effort: fall back to the numeric id
                t = str(chat_id)
            chat_titles[chat_id] = t
            return t

        count = 0
        async for msg in client.iter_messages(entity, **kwargs):
            count += 1
            if entity is None:
                where = await title_for(msg)
                print(f"[{where[:25]:<25}] {fmt_msg(msg)}")
            else:
                print(fmt_msg(msg))

        print(f"\n{count} hit(s)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
90
personas/_shared/skills/telegram/scripts/tg_send.py
Normal file
90
personas/_shared/skills/telegram/scripts/tg_send.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Send a message (text and/or file) to a Telegram chat.
|
||||
|
||||
Usage:
|
||||
python tg_send.py "@username" "Hello"
|
||||
python tg_send.py "Born2beroot" "Check this" --file report.pdf
|
||||
python tg_send.py "@chan" "" --file image.png --caption "screenshot"
|
||||
python tg_send.py "Saved Messages" "note to self" --yes
|
||||
python tg_send.py "@x" "Reply" --reply-to 12345
|
||||
python tg_send.py "@x" "Quiet ping" --silent
|
||||
|
||||
Defaults to dry-run preview + interactive [y/N] confirm. --yes skips it.
|
||||
Saved Messages is resolvable as "me" or by your own username.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
from config import API_HASH, API_ID, SESSION_NAME
|
||||
from tg_utils import confirm, resolve_chat
|
||||
|
||||
|
||||
async def main() -> None:
    """Preview a message and, once confirmed, send it to a single chat.

    Parses the command line, validates that there is something to send and
    that ``--file`` points at a regular file, resolves the target chat, prints
    a preview of what would be sent, and asks for confirmation unless
    ``--yes`` was given. Text-only messages go through ``send_message``;
    anything with ``--file`` goes through ``send_file`` with the caption taken
    from ``--caption`` or, failing that, the positional text.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("chat", help='@username, id, name; "me" for Saved Messages')
    p.add_argument("text", help="message text (use '' if only sending a file)")
    p.add_argument("--file", "-f", help="path to file/image to attach")
    p.add_argument("--caption", help="caption for the file (overrides text if --file given)")
    p.add_argument("--reply-to", type=int, help="message id to reply to")
    p.add_argument("--silent", action="store_true", help="send without notification")
    p.add_argument("--parse", choices=["md", "html", "none"], default="md",
                   help="text parse mode (default: md)")
    p.add_argument("--yes", "-y", action="store_true", help="skip confirmation")
    args = p.parse_args()

    if not args.text and not args.file:
        sys.exit("nothing to send: provide text and/or --file")

    # is_file() rather than exists(): a directory exists but cannot be attached.
    if args.file and not Path(args.file).is_file():
        sys.exit(f"file not found: {args.file}")

    if args.caption and not args.file:
        # The caption is only ever used by the send_file branch below; say so
        # instead of dropping it silently.
        print("warning: --caption is ignored without --file", file=sys.stderr)

    parse_mode = None if args.parse == "none" else args.parse

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        entity = await resolve_chat(client, "me" if args.chat in {"me", "self"} else args.chat)
        title = getattr(entity, "title", None) or getattr(entity, "username", None) or str(entity.id)

        print(f"→ to: {title} (id={entity.id})")
        if args.file:
            print(f"→ file: {args.file}")
            print(f"→ cap: {(args.caption or args.text)[:120]}")
        else:
            # Only append the ellipsis when something was actually cut off.
            preview = args.text if len(args.text) <= 200 else args.text[:200] + "…"
            print(f"→ text: {preview}")
        if args.reply_to:
            print(f"→ reply: msg #{args.reply_to}")
        if args.silent:
            print("→ silent: yes")

        if not args.yes and not confirm("Gönder?"):
            print("iptal.")
            return

        if args.file:
            sent = await client.send_file(
                entity,
                args.file,
                caption=args.caption or args.text or None,
                reply_to=args.reply_to,
                silent=args.silent,
                parse_mode=parse_mode,
            )
        else:
            sent = await client.send_message(
                entity,
                args.text,
                reply_to=args.reply_to,
                silent=args.silent,
                parse_mode=parse_mode,
            )

        print(f"✓ sent msg id={sent.id}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
100
personas/_shared/skills/telegram/scripts/tg_utils.py
Normal file
100
personas/_shared/skills/telegram/scripts/tg_utils.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Shared helpers for the tg_* CLI scripts (read/send/search/inbox)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
|
||||
async def resolve_chat(client: TelegramClient, ref: str):
    """Resolve a chat reference to a Telethon entity.

    ``ref`` is stripped and then tried, in this order, as:

    - ``"me"``, ``"self"`` or ``"saved messages"`` (case-insensitive): resolved
      with ``client.get_entity("me")`` so it can never be mistaken for a name
      fragment such as "Ho*me*".
    - ``"@username"`` or ``"+phone"``: passed to ``client.get_entity`` as is.
    - a numeric id (positive or negative; large negative for supergroups):
      passed to ``client.get_entity``; if that raises ``ValueError`` the
      dialog list is scanned for a dialog with that id.
    - anything else (including a bare username without ``@``): matched
      case-insensitively against dialog names. A single exact name match wins;
      otherwise every dialog whose name contains ``ref`` is a candidate.

    Exits the process via ``sys.exit`` when no dialog matches or when the
    reference is ambiguous (the candidates' ids and names are listed).
    """
    ref = ref.strip()
    if not ref:
        # An empty needle is a substring of every dialog name.
        sys.exit(f"chat not found: {ref!r}")

    needle = ref.lower()

    if needle in ("me", "self", "saved messages"):
        return await client.get_entity("me")

    if ref.startswith(("@", "+")):
        return await client.get_entity(ref)

    # Parse the id separately from the lookup so that a ValueError raised by
    # get_entity is not confused with "ref is not a number".
    try:
        chat_id = int(ref)
    except ValueError:
        chat_id = None

    if chat_id is not None:
        try:
            return await client.get_entity(chat_id)
        except ValueError:
            # Fall through: the dialog scan below can still find it by id,
            # or by name for chats whose title is made of digits.
            pass

    exact = []
    partial = []
    async for d in client.iter_dialogs(archived=None):
        if chat_id is not None and d.id == chat_id:
            return d.entity
        name = (d.name or "").lower()
        if name == needle:
            exact.append(d)
        elif needle in name:
            partial.append(d)

    # One exact name hit is unambiguous even if other names merely contain it.
    matches = exact if len(exact) == 1 else exact + partial

    if not matches:
        sys.exit(f"chat not found: {ref!r}")
    if len(matches) > 1:
        preview = "\n ".join(f"{d.id:>15} {d.name}" for d in matches[:10])
        more = "" if len(matches) <= 10 else f"\n ... +{len(matches)-10} more"
        sys.exit(
            f"ambiguous chat {ref!r} ({len(matches)} matches):\n "
            f"{preview}{more}\nuse the numeric id or @username"
        )
    return matches[0].entity
|
||||
|
||||
|
||||
def parse_date(s: str) -> datetime:
    """Parse ``YYYY-MM-DD`` or a full ISO timestamp into a UTC-aware datetime.

    Surrounding whitespace is ignored. A value containing ``T`` or a space is
    parsed with ``datetime.fromisoformat`` (a ``Z`` suffix is accepted as
    ``+00:00``); anything else must be a plain ``YYYY-MM-DD`` date. Naive
    results are interpreted as UTC, and results carrying another offset are
    converted to UTC, so the returned value always has ``tzinfo == UTC``.

    Raises:
        ValueError: if ``s`` matches neither format.
    """
    s = s.strip()
    if "T" in s or " " in s:
        dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
    else:
        dt = datetime.strptime(s, "%Y-%m-%d")
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    # Same instant, expressed in UTC as the docstring promises.
    return dt.astimezone(timezone.utc)
|
||||
|
||||
|
||||
def fmt_msg(msg, max_chars: int = 200) -> str:
    """Compact one-line representation of a Telethon Message.

    The line has four columns separated by ``│``: the message id
    (right-aligned to 9), the date as ``YYYY-MM-DD HH:MM``, the sender
    (cut/padded to 20) and the text.

    Args:
        msg: object exposing ``id``, ``date``, ``message``, ``media`` and
            optionally ``sender`` / ``sender_id``.
        max_chars: maximum length of the text column; longer text is cut and
            ends in ``…``. Values below 1 are treated as 1.

    Returns:
        The formatted line. Newlines in the text are shown as `` ⏎ ``; a
        message that has media but no text gets a ``[media:<TypeName>]`` tag.
    """
    sender_obj = getattr(msg, "sender", None)
    sender_id = getattr(msg, "sender_id", None)
    if sender_obj is not None:
        sender = (
            getattr(sender_obj, "username", None)
            or getattr(sender_obj, "first_name", None)
            or getattr(sender_obj, "title", None)
            or str(sender_id)
        )
    elif sender_id:
        sender = str(sender_id)
    else:
        sender = ""

    text = (msg.message or "").replace("\n", " ⏎ ")
    # Clamp so that a zero/negative limit cannot turn into a negative slice
    # (which would keep almost the whole text instead of truncating it).
    limit = max(max_chars, 1)
    if len(text) > limit:
        text = text[: limit - 1] + "…"

    media = ""
    if msg.media and not msg.message:
        media = f" [media:{type(msg.media).__name__}]"

    # Keep the column width stable when a message carries no date.
    date = getattr(msg, "date", None)
    stamp = date.strftime("%Y-%m-%d %H:%M") if date is not None else "-" * 16

    return f"{msg.id:>9} │ {stamp} │ {sender[:20]:<20} │ {text}{media}"
|
||||
|
||||
|
||||
def confirm(prompt: str = "Onayla", default: bool = False) -> bool:
    """Ask an interactive yes/no question and return the answer.

    The prompt is followed by `` [Y/n]: `` when ``default`` is true and by
    `` [y/N]: `` otherwise. An empty reply, or end-of-input on stdin, yields
    ``default``. Any other reply counts as "yes" only if, after trimming and
    lower-casing, it is one of ``y``, ``yes``, ``evet``, ``e`` or ``ok``.
    """
    if default:
        hint = " [Y/n]: "
    else:
        hint = " [y/N]: "

    try:
        reply = input(prompt + hint)
    except EOFError:
        # Nothing to read (closed or exhausted stdin): fall back to the default.
        return default

    answer = reply.strip().lower()
    if answer:
        return answer in ("y", "yes", "evet", "e", "ok")
    return default
|
||||
|
||||
|
||||
def chunked(items: Iterable, size: int):
    """Yield successive lists of at most ``size`` elements from ``items``.

    Every chunk except possibly the last holds exactly ``size`` elements; an
    empty input yields nothing. ``items`` is consumed lazily, one chunk at a
    time, so it may be any iterable, including a generator.

    Raises:
        ValueError: if ``size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    from itertools import islice

    if size < 1:
        # A non-positive size used to fall through and return the whole input
        # as a single chunk, which hides the caller's mistake.
        raise ValueError(f"size must be >= 1, got {size!r}")

    it = iter(items)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk
|
||||
Reference in New Issue
Block a user