Add ekos-gazete-search FullSweep + telegram + browser-use skills

Skills:
- ekos-gazete-search: EKOS gazete arşivi (1928-1942) tarama skill'i.
  + 04_export.py (CSV+DOCX), run_capped.sh (systemd cap wrapper),
    02_search_pdfs.py interleaved-dispatch patch (crash-safe), kirim_core.yaml.
- telegram: TG inbox/search/send/read scripts.
- browser-use: paperclip browser automation skill.

build.py:
- Add ekos-gazete-search → scribe, scholar, oracle, frodo, chronos,
  centurion, wraith mapping.
- Add telegram, browser-use mappings (browser-use uses "*" wildcard).
- Add wildcard "*" support in DEFAULT_SKILL_PERSONA_MAP.
- Add paperclip_skills + community_skills buckets to skill injection.
- Wrap yaml.safe_load in try/except for malformed frontmatter.
- Index paperclip_skills with inferred persona mapping.

README.md:
- Add telegram skill to Sentinel/Frodo/Oracle/Echo skill lists.
This commit is contained in:
salvacybersec
2026-04-30 20:45:31 +03:00
parent 3126dadd19
commit 00dc88bf5f
28 changed files with 3316 additions and 13 deletions

View File

@@ -84,17 +84,17 @@ cat generated/sentinel/apt-profiling.yaml # YAML with metadata
| **Specter** | Malware Analyst / Reverse Engineer | Cerrah | general, firmware | — |
| **Bastion** | Blue Team / DFIR | Muhafız | general, forensics, threat-hunting, incident-commander | senior-secops, sys-guard-linux-remediator, pcap-analyzer |
| **Vortex** | Network Ops / Traffic Analysis | Telsizci | general, cloud-ad | nmap-recon, pcap-analyzer, dns-networking |
| **Sentinel** | CTI / Threat Intelligence | İzci | general, apt-profiling, mitre-attack, darknet, **c2-hunting** | seithar-intel, gov-cybersecurity, pentest-c2-operator |
| **Sentinel** | CTI / Threat Intelligence | İzci | general, apt-profiling, mitre-attack, darknet, **c2-hunting** | seithar-intel, gov-cybersecurity, pentest-c2-operator, telegram |
### Intelligence (5 personas, 29 variants)
| Codename | Role | Hitap | Variants | Skills |
|----------|------|-------|----------|--------|
| **Frodo** | Strategic Intelligence Analyst | Müsteşar | general, middle-east, russia, iran, africa, china, pakistan, india, nato-alliance, nuclear, energy-geopolitics, turkey, salva | freshrss, freshrss-reader, seithar-intel, war-intel-monitor, news-crawler, dellight-intelligence-ops, dellight-strategic-intelligence |
| **Oracle** | OSINT & Digital Intelligence | Kaşif | general, crypto-osint, **source-verification**, salva | osint-investigator, stealth-browser, deep-scraper, crawl-for-ai, image-ocr, mistral-ocr, freshrss +2 |
| **Frodo** | Strategic Intelligence Analyst | Müsteşar | general, middle-east, russia, iran, africa, china, pakistan, india, nato-alliance, nuclear, energy-geopolitics, turkey, salva | freshrss, freshrss-reader, seithar-intel, war-intel-monitor, news-crawler, dellight-intelligence-ops, dellight-strategic-intelligence, telegram |
| **Oracle** | OSINT & Digital Intelligence | Kaşif | general, crypto-osint, **source-verification**, salva | osint-investigator, stealth-browser, deep-scraper, crawl-for-ai, image-ocr, mistral-ocr, freshrss, telegram +2 |
| **Ghost** | PSYOP & Information Warfare | Propagandist | general, cognitive-warfare, russian-info-war, salva | social-trust-manipulation-detector |
| **Wraith** | HUMINT & Counter-Intelligence | Mahrem | general, source-validation, case-studies, salva | — |
| **Echo** | SIGINT / COMINT / ELINT | Kulakçı | general, nsa-sigint, electronic-order-of-battle, salva | dellight-intelligence-ops |
| **Echo** | SIGINT / COMINT / ELINT | Kulakçı | general, nsa-sigint, electronic-order-of-battle, salva | dellight-intelligence-ops, telegram |
### Military & Strategy (4 personas, 24 variants)

View File

@@ -258,7 +258,12 @@ def build_persona(
# Inject mapped skills for this persona
if skills_index:
mapped_skills = []
for bucket in ("skills", "feynman_skills"):
for bucket in (
"skills",
"paperclip_skills",
"community_skills",
"feynman_skills",
):
for skill_name, skill_info in skills_index.get(bucket, {}).items():
if not isinstance(skill_info, dict):
continue
@@ -306,6 +311,8 @@ def build_persona(
DEFAULT_SKILL_PERSONA_MAP = {
# Browser automation for every persona
"browser-use": ["*"],
# Cybersecurity skills → personas
"pentest": ["neo"],
"nmap-recon": ["neo", "vortex"],
@@ -336,6 +343,7 @@ DEFAULT_SKILL_PERSONA_MAP = {
"news-crawler": ["frodo", "herald"],
"dellight-intelligence-ops": ["frodo", "echo"],
"dellight-strategic-intelligence": ["frodo"],
"telegram": ["frodo", "oracle", "sentinel", "echo"],
"agent-intelligence-network-scan": ["oracle"],
"social-trust-manipulation-detector": ["ghost"],
# Infrastructure skills → personas
@@ -349,6 +357,8 @@ DEFAULT_SKILL_PERSONA_MAP = {
# Web scraping → personas
"deep-scraper": ["oracle"],
"crawl-for-ai": ["oracle", "herald"],
# Historical / archival research → personas
"ekos-gazete-search": ["scribe", "scholar", "oracle", "frodo", "chronos", "centurion", "wraith"],
}
@@ -391,7 +401,10 @@ def parse_skill_frontmatter(skill_md: Path) -> dict:
fm_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
if not fm_match:
return {}
parsed = yaml.safe_load(fm_match.group(1))
try:
parsed = yaml.safe_load(fm_match.group(1))
except yaml.YAMLError:
return {}
return parsed if isinstance(parsed, dict) else {}
@@ -514,13 +527,18 @@ def infer_personas_from_skill_metadata(skill_name: str, metadata: dict) -> list:
def load_skill_persona_map(config: dict) -> dict:
"""Load skill→persona mapping from config.yaml or use defaults."""
custom = config.get("skill_persona_map", {})
merged = {
k: [p for p in v if p in VALID_PERSONAS]
for k, v in DEFAULT_SKILL_PERSONA_MAP.items()
}
merged = {}
for skill, personas in DEFAULT_SKILL_PERSONA_MAP.items():
if "*" in personas:
merged[skill] = sorted(VALID_PERSONAS)
else:
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
for skill, personas in custom.items():
if isinstance(personas, list):
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
if "*" in personas:
merged[skill] = sorted(VALID_PERSONAS)
else:
merged[skill] = [p for p in personas if p in VALID_PERSONAS]
return merged
@@ -718,7 +736,35 @@ def build_skills_index(shared_dir: Path, config: dict = None) -> dict:
continue
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
index["paperclip_skills"][skill_dir.name] = True
skill_meta = parse_skill_frontmatter(skill_md)
inferred_personas = infer_personas_from_skill_metadata(
skill_dir.name, skill_meta
)
configured_personas = skill_map.get(skill_dir.name, [])
merged_personas = sorted(
set(configured_personas).union(inferred_personas)
)
content = skill_md.read_text(encoding="utf-8")
first_line = ""
for line in content.split("\n"):
line = line.strip()
if line and not line.startswith(
("---", "#", "name:", "description:")
):
first_line = line[:120]
break
index["paperclip_skills"][skill_dir.name] = {
"personas": merged_personas,
"summary": first_line,
"domain": str(skill_meta.get("domain", "")),
"subdomain": str(skill_meta.get("subdomain", "")),
"tags": skill_meta.get("tags", []),
"mapped_by": {
"explicit": configured_personas,
"inferred": inferred_personas,
},
"has_references": (skill_dir / "references").is_dir(),
}
# Index community-skills
cskills_dir = shared_dir / "community-skills"
@@ -728,7 +774,35 @@ def build_skills_index(shared_dir: Path, config: dict = None) -> dict:
continue
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
index["community_skills"][skill_dir.name] = True
skill_meta = parse_skill_frontmatter(skill_md)
inferred_personas = infer_personas_from_skill_metadata(
skill_dir.name, skill_meta
)
configured_personas = skill_map.get(skill_dir.name, [])
merged_personas = sorted(
set(configured_personas).union(inferred_personas)
)
content = skill_md.read_text(encoding="utf-8")
first_line = ""
for line in content.split("\n"):
line = line.strip()
if line and not line.startswith(
("---", "#", "name:", "description:")
):
first_line = line[:120]
break
index["community_skills"][skill_dir.name] = {
"personas": merged_personas,
"summary": first_line,
"domain": str(skill_meta.get("domain", "")),
"subdomain": str(skill_meta.get("subdomain", "")),
"tags": skill_meta.get("tags", []),
"mapped_by": {
"explicit": configured_personas,
"inferred": inferred_personas,
},
"has_references": (skill_dir / "references").is_dir(),
}
# Index feynman-skills (research workflows adapted from Feynman).
# Use the same persona-aware indexing as shared skills so mapped skills

View File

@@ -0,0 +1,121 @@
---
name: browser-use
description: Automates browser interactions for web testing, form filling, screenshots, and data extraction. Use when the user needs to navigate websites, interact with web pages, fill forms, take screenshots, or extract information from web pages.
license: MIT
metadata:
author: browser-use
version: "1.1.0"
domain: engineering
subdomain: browser-automation
triggers: browser-use, browser automation, web scraping, form filling, screenshot, cloud browser, playwright cdp, session replay, workspace files, profile sync
role: engineer
scope: implementation
---
# Browser Use
Use Browser Use Cloud SDK and API to run browser agents and raw browser sessions.
## When To Use
- Navigate websites and extract structured data
- Fill forms and execute multi-step workflows
- Stream live browser actions and agent messages
- Reuse sessions, profiles, and workspaces across tasks
- Connect Playwright/Puppeteer via CDP to cloud browsers
## Install
```bash
pip install browser-use-sdk
export BROWSER_USE_API_KEY=your_key
```
TypeScript:
```bash
npm install browser-use-sdk
```
## Quick Start (v3 SDK)
```python
from browser_use_sdk.v3 import AsyncBrowserUse
client = AsyncBrowserUse()
result = await client.run("List the top 20 posts on Hacker News today with their points")
print(result.output)
```
```typescript
import { BrowserUse } from "browser-use-sdk/v3";
const client = new BrowserUse();
const result = await client.run("List the top 20 posts on Hacker News today with their points");
console.log(result.output);
```
## Core Patterns
- `run()` for one-shot tasks: auto create + poll + return output.
- `sessions.create()` + `session_id` for follow-up tasks with shared browser state.
- `workspaces.*` for file upload/download workflows.
- `profiles.*` for login persistence and recurring automation.
- `browsers.create()` for raw CDP control (Playwright/Puppeteer).
### Follow-up task pattern
```python
session = await client.sessions.create()
await client.run("Go to amazon.com and open first laptop", session_id=session.id)
await client.run("Extract customer reviews", session_id=session.id)
await client.sessions.stop(session.id)
```
### Structured output
- Python: pass `output_schema` (Pydantic).
- TypeScript: pass `schema` (Zod v4 required).
### Stream messages
- Iterate over `client.run(...)` to receive live messages.
- `run.result` is valid only after iteration completes.
### Deterministic rerun (cache-script)
- Use `@{{...}}` placeholders in task plus `workspace_id`.
- First run builds script, next runs can execute without LLM.
- `cache_script`: `None` (auto), `True` (force), `False` (disable).
## Agent vs Browser
- Agent mode: `client.run(...)`, `client.sessions.*`.
- Browser mode: `client.browsers.create(...)` returns `cdp_url` + `live_url`.
- Use browser mode when you need custom CDP automation with Playwright/Puppeteer.
## Authentication and Persistence
- API key env: `BROWSER_USE_API_KEY`.
- Header for direct API calls: `X-Browser-Use-API-Key: <key>`.
- For user-specific state: create one profile per user and reuse `profile_id`.
## Operations Checklist
- Always stop sessions/browsers when done to avoid idle charges.
- Always stop profiled sessions to persist cookies/localStorage correctly.
- Sessions idle-timeout after 15 minutes; max duration is 4 hours.
- Recording links are presigned and expire quickly (about 1 hour).
## Common Gotchas
- If streaming loop is interrupted early, cancel with `sessions.stop(..., strategy="task")` before sending another task.
- TypeScript structured output fails with Zod v3; use Zod v4.
- Selenium remote CDP support is limited; prefer Playwright/Puppeteer for cloud CDP.
- Deleting a workspace is permanent.
## Reference
- Full LLM-optimized docs: `https://docs.browser-use.com/llms-full.txt`
- Quick index: `https://docs.browser-use.com/llms.txt`
- API key: `https://cloud.browser-use.com/settings?tab=api-keys&new=1`

View File

@@ -0,0 +1,60 @@
# ekos-gazete-search
Claude Code skill: İstanbul Üniversitesi EKOS gazete arşivinde (1928-1942, 53 gazete, 581 106 OCR'lı sayfa) konu-bazlı sistematik arama.
## Hızlı başlangıç
```bash
cd ~/.claude/skills/ekos-gazete-search
python3 -m venv .venv && source .venv/bin/activate
pip install -r scripts/requirements.txt
# 1) Manifest oluştur (~1 dk, tek seferlik)
python scripts/01_build_manifest.py
# 2) Kırım taramasını öncelikli pencerelerle başlat
python scripts/02_search_pdfs.py \
--keywords keywords/kirim.yaml \
--priority-only \
--workers 4
# 3) Obsidian raporu oluştur
python scripts/03_render_report.py --topic Kirim
```
## Yapı
```
.
├── SKILL.md # Claude'a yönerge
├── README.md # bu dosya
├── keywords/
│ ├── _template.yaml # yeni konu için şablon
│ └── kirim.yaml # Kırım (Hanlık, Tatar, diaspora, Sovyet)
├── scripts/
│ ├── 01_build_manifest.py # 53 gazete sayfasını çek → manifest CSV
│ ├── 02_search_pdfs.py # PDF indir + pdftotext + fuzzy regex → JSONL
│ ├── 03_render_report.py # JSONL → Obsidian markdown
│ ├── lib/fuzzy.py # OCR-toleranslı Türkçe regex motoru
│ └── requirements.txt
├── manifests/ # üretilmiş CSV'ler
└── hits/ # üretilmiş JSONL hit dosyaları
```
## Yeni konu
```bash
cp keywords/_template.yaml keywords/filistin.yaml
# Düzenle: canonical, aliases, proper_nouns, disambiguators, priority_windows
python scripts/02_search_pdfs.py --keywords keywords/filistin.yaml --out hits/filistin.jsonl
python scripts/03_render_report.py --hits hits/filistin.jsonl --topic Filistin
```
## Sınırlar
- **Bant genişliği:** 581k sayfa × ~14MB PDF ≈ 8+ TB. Skill her PDF'i indirir, text-layer çıkarır, hit yoksa siler. Tam mirror YAPMAZ.
- **Throttle:** varsayılan 0.25 sn/istek + 4 worker = ~3 sayfa/sn. Kütüphaneye nezaket.
- **OCR:** 2014 vintage, Türkçe diakritikleri çöp. Fuzzy regex bunu telafi eder ama %100 değildir.
- **Kapsam:** 1928-1942. **Kırım sürgünü (1944) bu arşivde YOK.**
Ayrıntı için: `SKILL.md` ve [vault haritalama notu](/home/salva/Obsidian/6-Geopolitics/Russia/03.%20HISTORICAL%20CONTEXT/EKOS-Gazete-Arsivi-Haritalama.md).

View File

@@ -0,0 +1,169 @@
---
name: ekos-gazete-search
description: "İstanbul Üniversitesi EKOS gazete arşivinde (1928-1942, 53 gazete, 581k sayfa OCR'lı) konu-bazlı sistematik arama. Türkçe-OCR-toleranslı fuzzy regex, öncelikli zaman pencereleri, Obsidian raporu üretimi. Kırım, Filistin, Holodomor, herhangi bir konu için parametrik."
domain: intelligence
subdomain: archival-research
tags:
- archive
- foia
- ottoman-press
- turkish-press
- historical-research
- ocr
- pdf
- newspaper
- early-republic
- crimea
- kirim
- diaspora
personas:
- scribe
- scholar
- oracle
- frodo
---
# EKOS Gazete Arama — Skill
## Ne zaman çağırılır?
Kullanıcı şunlardan birini söylediğinde:
- "EKOS arşivinde X tara/ara"
- "İstanbul Üniversitesi gazete arşivinde X haberlerini bul"
- "1928-1942 Türk basınında X"
- "nek.istanbul.edu.tr gazetelerinde tarama"
- Var olan keyword set'i (Kırım, Filistin, vb.) ile yeniden çalıştır
## Mimari özet
```
SLUG (53 gazete) → manifest.csv → fuzzy search → hits.jsonl → Obsidian raporu
```
3 aşama, üç ayrı script:
1. **`scripts/01_build_manifest.py`** — 53 gazete sayfasını çekip tüm PDF URL'lerini `manifests/ekos_master.csv`'ye yazar. Bir kez çalıştırılır, cache'lenir.
2. **`scripts/02_search_pdfs.py`** — manifest üzerinden iterate; her PDF'i indir, `pdftotext` ile metni çıkar, fuzzy regex'le ara, hit'leri `hits/<topic>.jsonl`'ye yaz, PDF'i sil.
3. **`scripts/03_render_report.py`** — JSONL'yi `6-Geopolitics/Russia/03. HISTORICAL CONTEXT/` altına master + yıllık raporlar olarak markdown'a render eder.
## Önkoşullar
```bash
# Sistem paketleri (Kali Linux'ta zaten var olabilir)
which pdftotext pdfinfo curl # poppler-utils
# Python venv (CLAUDE.md kuralı: sisteme değil venv'e kur)
cd /home/salva/.claude/skills/ekos-gazete-search
python3 -m venv .venv
source .venv/bin/activate
pip install requests pyyaml beautifulsoup4
```
## Tipik kullanım akışı
### A) İlk çalıştırma — manifest oluştur
```bash
cd /home/salva/.claude/skills/ekos-gazete-search
source .venv/bin/activate
python scripts/01_build_manifest.py
# → manifests/ekos_master.csv (tek seferlik, ~5 dk)
```
### B) Arama — Kırım için, öncelikli pencerelerden başlayarak
```bash
# Strateji B: 1932-33, 1936-37, 1941-42 önce
python scripts/02_search_pdfs.py \
--keywords keywords/kirim.yaml \
--priority-only \
--workers 4 \
--out hits/kirim.jsonl
# Sonra geri kalan tüm yıllar
python scripts/02_search_pdfs.py \
--keywords keywords/kirim.yaml \
--workers 4 \
--out hits/kirim.jsonl
```
### C) POC modu — sadece 5 ana gazete, az veri ile test
```bash
python scripts/02_search_pdfs.py \
--keywords keywords/kirim.yaml \
--slug cumhuriyet \
--year-from 1932 --year-to 1933 \
--limit 50 \
--out hits/kirim_poc.jsonl
```
### D) Raporu render et
```bash
python scripts/03_render_report.py \
--hits hits/kirim.jsonl \
--topic Kirim \
--keywords keywords/kirim.yaml
# → 6-Geopolitics/Russia/03. HISTORICAL CONTEXT/EKOS-Kirim-Bulgular.md (master)
# → EKOS-Kirim-1932.md, EKOS-Kirim-1933.md, ... (yıllık)
```
## Yeni konu eklemek
1. `keywords/<topic>.yaml` oluştur — `keywords/_template.yaml`'ı şablon olarak kullan.
2. Wordlist'i doldur: `canonical`, `aliases`, `proper_nouns` (kişi adları), `disambiguators` (false positive filtreleri).
3. `priority_windows` tanımla — konunun yoğunlaştığı yıllar.
4. Çalıştır: `python scripts/02_search_pdfs.py --keywords keywords/<topic>.yaml --out hits/<topic>.jsonl`
## OCR Toleransı
PDF'lerin OCR'ı 2014 vintage, kalitesi orta-düşük. Türkçe diakritikleri sistematik olarak bozulmuş:
| Doğru | OCR'da | Regex class |
|---|---|---|
| `ı` | `1, i, l, \|` | `[1iIıİlj\|]` |
| `ş` | `~, s` | `[s~ş]` |
| `ç` | `c` | `[cç]` |
| `ğ` | `g` | `[gğ]` |
| `ü` | `u, ii` | `(?:[uü]\|ii)` |
| `ö` | `o` | `[oö]` |
`scripts/lib/fuzzy.py` bu mapping'i otomatik uygular: `build_pattern("Kırım")` → `r"K[1iIıİlj|][rR][1iIıİlj|]m"`.
## Sınırlar ve uyarılar
- **Yunanca/Ermenice gazeteler** (apoyevmatini, aravelk, jamanak, metapolitefsis): OCR'ları henüz test edilmedi. İlk taramada Latin transkripsiyon aliases üzerinden tarayacak. Yetersizse ileride Tesseract `ell`/`hye` ile re-OCR eklenir.
- **Throttle:** 0.25 sn/istek. 581k sayfa tüm arşiv için 4 worker × ~12-18 saat. Kütüphaneye nezaket.
- **False positive:** "Kerim" (özel ad) ↔ "Kırım", "Kefe" (ilçe) ↔ "kefil/kefe" çakışması olur. Hit listesini gözden geçirirken `disambiguators` listesini büyüt.
- **Telif:** 1928-1942 PDF'ler kütüphane tarafından dağıtılıyor; biz sadece arama yapıp URL referansı kaydediyoruz, kalıcı kopya almıyoruz. Yasal sorun yok.
## Çıktı şeması (`hits/*.jsonl`)
Her satır bir hit:
```json
{
"slug": "cumhuriyet",
"year": "1933",
"month": "subat",
"day": "12",
"page": 3,
"keyword": "Kırım",
"match": "K1r1m",
"snippet": "...lan acl1k haberlerine gore K1r1m'da binlerce..." ,
"url": "https://nek.istanbul.edu.tr/.../cumhuriyet_1933_subat_12_.pdf"
}
```
## Persona ile entegrasyon
Bu skill, `persona-scribe-salva` (FOIA arşivci) personasının el aletidir. Scribe persona, arşiv-tarama görevi aldığında bu skill'i çağırır. Diğer alakalı personalar:
- `persona-frodo-russia` — Sovyet/Rus dönem analizi için hit'leri yorumlar
- `persona-centurion` — Askeri/savaş haberleri (1941-42 Doğu Cephesi)
- `persona-polyglot-russian` — Yunanca/Ermenice gazeteler aktive olduğunda
## Bilinen geliştirme alanları
- [ ] Yunanca/Ermenice OCR re-pass (Tesseract 5)
- [ ] Hit-level Tesseract doğrulaması (yanlış pozitif azaltma)
- [ ] Dataview view'ı (Obsidian'da hit listesi sortlanabilir)
- [ ] Kütüphaneye yazılı bilgi notu (büyük tarama öncesi)

View File

@@ -0,0 +1,38 @@
# EKOS Gazete Arama — Keyword Set Template
# Yeni konu eklerken bu dosyayı kopyala: cp _template.yaml <topic>.yaml
#
# Şema açıklaması:
# - canonical: aramada görüntülenecek "doğru" yazım (raporda bu görünür)
# - aliases: aynı kavramın diğer yazımları (transliterasyon, eski Türkçe, yabancı dil)
# - suffixes: opsiyonel — Türkçe ek toleransı (Kırım+lı, Kırım+da, ...)
# - weight: hit önemi (1=zayıf sinyal, 5=smoking gun). Rapor sıralaması bunu kullanır.
# - notes: bağlam (raporda görünmez)
topic: example
description: "Konu kısa açıklaması — raporun başlığında görünür"
# 1. Ana terimler — geniş, kavram seviyesi
keywords:
- canonical: "Örnek"
aliases: ["Example", "Beispiel"]
weight: 3
notes: "Genel terim"
# 2. Özel isimler — kişiler, yerler (smoking gun)
proper_nouns:
- canonical: "Mustafa Kemal"
aliases: ["Gazi", "Atatürk"]
weight: 5
# 3. Disambiguator — false positive filtre
# Eğer match'in ±20 karakter çevresinde bu terimler varsa hit reddedilir
disambiguators:
- "Kerim Bey" # "Kırım" ile karışan özel ad
- "Kerime Hanım"
# 4. Öncelikli zaman pencereleri — bu yıllarda hit'ler önce taranır + raporda öne çıkar
priority_windows:
- start: "1932-01-01"
end: "1933-12-31"
reason: "Açıklama"
weight: 5

View File

@@ -0,0 +1,286 @@
# EKOS Gazete Arama — Kırım Keyword Set
# Kapsam: Kırım Hanlığı dönemi mirasından 1942'ye kadar Türk basınında
# Kırım coğrafyası, halkı, diasporası, Sovyet dönemi, ve siyasi figürleri.
topic: Kirim
description: "Kırım — Hanlık mirası, Tatar halkı, diaspora, Sovyet dönemi (1928-1942)"
# ═══════════════════════════════════════════════════════════════════════
# 1. ANA TERİMLER — coğrafya ve kavram (geniş, weight 3-4)
# ═══════════════════════════════════════════════════════════════════════
keywords:
# Coğrafi temel
- canonical: "Kırım"
aliases: ["Crimea", "Krim", "Krym", "Krymea", "Crimee", "La Crimee"]
suffixes: ["lı", "lılar", "dan", "ya", "a", "da", "de", "i", "ın", "ı"]
weight: 4
notes: "Ana terim. OCR'da K1r1m, Kirim varyantları hakim."
- canonical: "Kırım Hanlığı"
aliases: ["Khanate of Crimea", "Crimean Khanate"]
weight: 5
notes: "Tarihsel devlet (1441-1783). Hanlık nostaljisi 1930'larda diaspora söyleminde aktif."
- canonical: "Kırım Yarımadası"
aliases: ["Crimean Peninsula", "Tauride"]
weight: 4
- canonical: "Kırım Türkleri"
aliases: ["Kırım Tatarları", "Crimean Tatars", "Krimtataren", "Tatars de Crimee"]
weight: 5
notes: "Diaspora söyleminde 'Türk' kelimesi 'Tatar' yerine sık kullanıldı"
- canonical: "Tatar"
aliases: ["Tatarlar", "Tatarların", "Tatare", "Tartar"]
weight: 2
notes: "WEIGHT DÜŞÜK — çok geniş hit verecek (Kazan Tatarı, Sibirya Tatarı vs). Disambiguator gerekir."
# Kırım şehirleri ve coğrafi noktalar
- canonical: "Bahçesaray"
aliases: ["Bahçe-saray", "Bagcesaray", "Bachtschisaraj", "Bakhchisaray", "Bakhchysaray"]
weight: 5
notes: "Hanlık başkenti — geçtiyse %100 Kırım bağlamı"
- canonical: "Akmescit"
aliases: ["Ak-mescit", "Akmesçit", "Simferopol", "Симферополь", "Simferopole"]
weight: 5
- canonical: "Kefe"
aliases: ["Caffa", "Theodosia", "Feodosiya", "Feodosia", "Theodosie"]
weight: 5
notes: "Eski Ceneviz/Osmanlı liman şehri. 'kefil/kefa' ile çakışmaya dikkat — disambiguator zorunlu."
- canonical: "Gözleve"
aliases: ["Yevpatoriya", "Eupatoria", "Yevpatoria"]
weight: 5
- canonical: "Sivastopol"
aliases: ["Sebastopol", "Sevastopol", "Sevastopolj", "Sevastopole"]
weight: 4
notes: "1854-55 Kırım Savaşı'nda meşhur, 1942'de Alman kuşatması"
- canonical: "Kerç"
aliases: ["Kertsch", "Kerch", "Керчь", "Kerč"]
weight: 4
notes: "1941-42 Doğu Cephesi'nde stratejik"
- canonical: "Yalta"
aliases: ["Jalta", "Ялта"]
weight: 4
- canonical: "Çatırdağ"
aliases: ["Çatır Dağı", "Chatyr-Dag", "Tschatyr-Dag"]
weight: 5
notes: "Kırım Tatar şiir/hatıra geleneğinde sembol — diaspora yazılarının imzası"
- canonical: "Or Kapı"
aliases: ["Orkapı", "Perekop", "Перекоп"]
weight: 5
notes: "Kırım'a giriş kapısı; askeri haberlerin merkezi (1920 İç Savaş, 1941-42)"
- canonical: "Karasubazar"
aliases: ["Karasu Bazar", "Karasubazaar", "Belogorsk"]
weight: 5
- canonical: "Kezlev"
aliases: ["Yevpatoria", "Kozlov"]
weight: 4
# Tarihsel / siyasi kavramlar
- canonical: "Kırım Muhtar Cumhuriyeti"
aliases: ["Crimean ASSR", "Krimskaja ASSR", "Кримська АРСР", "Crimean Autonomous"]
weight: 5
notes: "1921'de kurulan Sovyet özerk cumhuriyeti — 1928-1942 arası tüm Kırım haberinin idari bağlamı"
- canonical: "Milli Fırka"
aliases: ["Millî Fırka", "Milli Firka", "Kırım Milli Fırkası"]
weight: 5
notes: "Numan Çelebi Cihan'ın partisi — diaspora yazılarında smoking gun"
- canonical: "Kurultay"
aliases: ["Kırım Kurultayı"]
weight: 4
notes: "1917 Kurultay'ı, Kazan Kurultay'ı ile karışabilir — bağlam denetimi gerekli"
- canonical: "muhacir"
aliases: ["muhacirin", "muhacirler", "mültecilik", "mülteci"]
weight: 2
notes: "Genel terim ama Kırım göçü konusunda yoğun. Düşük weight + bağlam."
# Kırım Savaşı (tarihsel referans olarak gazetelerde geçer)
- canonical: "Kırım Savaşı"
aliases: ["Kırım Harbi", "Crimean War", "Krimkrieg", "Guerre de Crimee"]
weight: 4
notes: "1853-56. Tarihsel makaleler 1928-1942 boyunca düzenli."
# Sovyet dönem terminoloji
- canonical: "kollektivizasyon"
aliases: ["kollektifleştirme", "kolhoz", "sovhoz", "kolxoz"]
weight: 2
notes: "Geniş Sovyet bağlamı; Kırım haberleriyle birlikte gelirse weight artar"
- canonical: "açlık"
aliases: ["kıtlık", "ac11k", "kit11k"]
weight: 1
notes: "Çok geniş — sadece Kırım/Sovyet ile yakınsa anlamlı"
# ═══════════════════════════════════════════════════════════════════════
# 2. ÖZEL İSİMLER — kişiler (smoking gun, weight 5)
# Bir gazete sayfasında bu isimlerden biri geçtiyse Kırım içeriği %95 garanti.
# ═══════════════════════════════════════════════════════════════════════
proper_nouns:
# Kırım Tatar siyasi liderleri
- canonical: "Numan Çelebi Cihan"
aliases: ["Noman Çelebicihan", "Numan Çelebicihan", "Çelebi Cihan", "Celebi Cihan"]
weight: 5
notes: "Kırım Müslüman Demokratik Cumhuriyeti kurucusu (1917), Bolşeviklerce öldürüldü 1918"
- canonical: "Cafer Seydahmet"
aliases: ["Cafer Seyit Ahmet", "Cafer Seydamet", "Seydahmet Kırımer", "Cafer Kırımer", "Cafer Seyid Ahmet"]
weight: 5
notes: "İstanbul'da Kırım diasporasının lideri; 1928-1942 arası aktif yazar"
- canonical: "Müstecip Ülküsal"
aliases: ["Mustecip Ulkusal", "Müstecip Hacı Fazıl", "Ülküsal"]
weight: 5
notes: "Romanya/Köstence merkezli Kırım Tatar lideri, 'Emel' dergisi"
- canonical: "Hamdullah Suphi"
aliases: ["Hamdullah Suphi Tanrıöver", "Tanrıöver"]
weight: 4
notes: "Türk Ocakları reisi, Kırım/Romanya muhaceretiyle ilgili devlet adamı"
- canonical: "Yusuf Akçura"
aliases: ["Yusuf Akçuraoğlu", "Akçura", "Akcura"]
weight: 4
notes: "Kazan Tatarı ama Türkçü/Tatar dünyasının ortak figürü"
- canonical: "İsmail Gaspıralı"
aliases: ["İsmail Bey Gaspıralı", "Gasprinski", "Gasprinsky", "Ismail Gaspirali"]
weight: 5
notes: "Tercüman gazetesi yayıncısı, Türkçülüğün babası — anma yazıları sık"
- canonical: "Veli İbrahim"
aliases: ["Veli Ibraimov", "Veli Ibrahim"]
weight: 5
notes: "Kırım Muhtar Cumhuriyeti başkanı, 1928'de Stalin tarafından idam"
- canonical: "Bekir Çobanzade"
aliases: ["Bekir Çoban-zade", "Çobanzade", "Cobanzade"]
weight: 5
notes: "Kırım Tatar dilbilimci, 1937'de tasfiye edildi"
- canonical: "Mehmet Niyazi"
aliases: ["Memet Niyazi", "Mehmed Niyazi"]
weight: 4
notes: "Romanya/Köstence Kırım Tatar şairi"
- canonical: "Habibullah Kerimi"
aliases: ["Habibullah Karimi", "Kerimi"]
weight: 4
- canonical: "Asan Sabri Ayvaz"
aliases: ["Asan Sabri Ayvazov", "Ayvazov", "Sabri Ayvazov"]
weight: 5
notes: "Kırım Tatar yazar, 1937'de Stalin terörü kurbanı"
- canonical: "Reşit Mediyev"
aliases: ["Reşit Mediev", "Mediyev", "Medief"]
weight: 5
# Sovyet/Rus tarafı (Kırım'la doğrudan iş tutmuş)
- canonical: "Stalin"
aliases: ["Staline", "Сталин"]
weight: 1
notes: "Çok geniş; sadece Kırım/Tatar ile co-occurring olduğunda anlamlı"
# ═══════════════════════════════════════════════════════════════════════
# 3. DİSAMBİGÜATÖRLER — false positive filtreleri
# Bir match'in ±50 karakter çevresinde bu kelime varsa hit reddedilir
# ═══════════════════════════════════════════════════════════════════════
disambiguators:
# "Kırım" ↔ "Kerim" (özel isim) çakışması
- "Kerim Bey"
- "Kerim Pa~a" # Kerim Paşa OCR
- "Kerim Pasa"
- "Kerime Hanim"
- "Kerime Han1m"
- "Kerim Efendi"
- "Abdulkerim"
- "Abdiilkerim" # OCR varyantı
# "Kefe" (Crimea) ↔ "kefil/kefe" (sigorta/teminat)
- "kefil"
- "kefalet"
- "kefaleten"
# "Tatar" yiyecekler
- "tatar boregi"
- "tatar boregi"
- "tatar pidesi"
- "tatar sosu"
# "Yalta" ↔ Türkçe "yalta" yok; "yaltak" var
- "yaltakl"
- "yaltaklan"
# 1932 Türk Dili Kurultayı / Türk Tarih Kurultayı false positive'leri (POC iter-1'de öğrendik)
# "Kurultay" tek başına Kırım için yetersiz; bu kombinler Atatürk dönemi reformları
- "Türk Dili Kurultayı"
- "Türk Dili Kurultay"
- "Tiirk Dili Kurultayi" # OCR varyantı
- "Dil Kurultayı"
- "Dil Kurultay"
- "Türk Tarih Kurultayı"
- "Türk Tarih Kurultay"
- "Tarih Kurultay"
- "tarih kurultay"
- "Halkevi Kurultay"
- "halkevleri kurultay"
- "C.H.F. Kurultay" # Cumhuriyet Halk Fırkası Kurultayı
- "C.H.P. Kurultay"
- "Fırka Kurultay"
- "Parti Kurultay"
# Kefe varyantları (genel "kıfayet/kifaye" OCR çöplüğü)
- "kifayet"
- "kifayetli"
# ═══════════════════════════════════════════════════════════════════════
# 4. ÖNCELİKLİ ZAMAN PENCERELERİ
# Bu pencerelerdeki sayılar önce taranır, raporda öne çıkar
# ═══════════════════════════════════════════════════════════════════════
priority_windows:
- start: "1928-01-01"
end: "1928-12-31"
weight: 4
reason: "Veli İbrahim idamı + Kırım Tatar tasfiyesinin başlangıcı"
- start: "1932-01-01"
end: "1933-12-31"
weight: 5
reason: "Holodomor / Kırım açlığı — Sovyet kıtlığının zirvesi"
- start: "1936-01-01"
end: "1938-06-30"
weight: 5
reason: "Stalin Büyük Terör — Çobanzade, Ayvazov, Bekirov tasfiyeleri"
- start: "1939-08-23"
end: "1941-06-22"
weight: 4
reason: "Molotov-Ribbentrop dönemi; Sovyet politikasında diaspora söylemi"
- start: "1941-06-22"
end: "1942-12-31"
weight: 5
reason: "Alman Doğu Cephesi ilerleyişi; Kırım'ın Wehrmacht tarafından işgali"
# ═══════════════════════════════════════════════════════════════════════
# 5. CO-OCCURRENCE BOOST — birlikte geçerse hit ağırlığı artar
# (lib/fuzzy.py içinde proximity score için)
# ═══════════════════════════════════════════════════════════════════════
co_occurrence_boost:
# Bu çiftler aynı paragrafta (±300 char) geçerse weight +2
- ["Kırım", "Tatar"]
- ["Kırım", "muhacir"]
- ["Sovyet", "Kırım"]
- ["Kırım", "açlık"]
- ["Kırım", "kollektivizasyon"]
- ["Tatar", "Bahçesaray"]
- ["Stalin", "Kırım"]

View File

@@ -0,0 +1,297 @@
topic: KirimCore
description: Kırım — sadece toponym ve Kırım-prefix kavramlar (dar tarama)
keywords:
- canonical: Kırım
aliases:
- Crimea
- Krim
- Krym
- Krymea
- Crimee
- La Crimee
suffixes:
- lı
- lılar
- dan
- ya
- a
- da
- de
- i
- ın
- ı
weight: 4
notes: Ana terim. OCR'da K1r1m, Kirim varyantları hakim.
- canonical: Kırım Hanlığı
aliases:
- Khanate of Crimea
- Crimean Khanate
weight: 5
notes: Tarihsel devlet (1441-1783). Hanlık nostaljisi 1930'larda diaspora söyleminde aktif.
- canonical: Kırım Yarımadası
aliases:
- Crimean Peninsula
- Tauride
weight: 4
- canonical: Kırım Türkleri
aliases:
- Kırım Tatarları
- Crimean Tatars
- Krimtataren
- Tatars de Crimee
weight: 5
notes: Diaspora söyleminde 'Türk' kelimesi 'Tatar' yerine sık kullanıldı
- canonical: Bahçesaray
aliases:
- Bahçe-saray
- Bagcesaray
- Bachtschisaraj
- Bakhchisaray
- Bakhchysaray
weight: 5
notes: Hanlık başkenti — geçtiyse %100 Kırım bağlamı
- canonical: Akmescit
aliases:
- Ak-mescit
- Akmesçit
- Simferopol
- Симферополь
- Simferopole
weight: 5
- canonical: Kefe
aliases:
- Caffa
- Theodosia
- Feodosiya
- Feodosia
- Theodosie
weight: 5
notes: Eski Ceneviz/Osmanlı liman şehri. 'kefil/kefa' ile çakışmaya dikkat — disambiguator zorunlu.
- canonical: Gözleve
aliases:
- Yevpatoriya
- Eupatoria
- Yevpatoria
weight: 5
- canonical: Sivastopol
aliases:
- Sebastopol
- Sevastopol
- Sevastopolj
- Sevastopole
weight: 4
notes: 1854-55 Kırım Savaşı'nda meşhur, 1942'de Alman kuşatması
- canonical: Kerç
aliases:
- Kertsch
- Kerch
- Керчь
- Kerč
weight: 4
notes: 1941-42 Doğu Cephesi'nde stratejik
- canonical: Yalta
aliases:
- Jalta
- Ялта
weight: 4
- canonical: Çatırdağ
aliases:
- Çatır Dağı
- Chatyr-Dag
- Tschatyr-Dag
weight: 5
notes: Kırım Tatar şiir/hatıra geleneğinde sembol — diaspora yazılarının imzası
- canonical: Or Kapı
aliases:
- Orkapı
- Perekop
- Перекоп
weight: 5
notes: Kırım'a giriş kapısı; askeri haberlerin merkezi (1920 İç Savaş, 1941-42)
- canonical: Karasubazar
aliases:
- Karasu Bazar
- Karasubazaar
- Belogorsk
weight: 5
- canonical: Kezlev
aliases:
- Yevpatoria
- Kozlov
weight: 4
- canonical: Kırım Muhtar Cumhuriyeti
aliases:
- Crimean ASSR
- Krimskaja ASSR
- Кримська АРСР
- Crimean Autonomous
weight: 5
notes: 1921'de kurulan Sovyet özerk cumhuriyeti — 1928-1942 arası tüm Kırım haberinin idari bağlamı
- canonical: Kırım Savaşı
aliases:
- Kırım Harbi
- Crimean War
- Krimkrieg
- Guerre de Crimee
weight: 4
notes: 1853-56. Tarihsel makaleler 1928-1942 boyunca düzenli.
proper_nouns:
- canonical: Numan Çelebi Cihan
aliases:
- Noman Çelebicihan
- Numan Çelebicihan
- Çelebi Cihan
- Celebi Cihan
weight: 5
notes: Kırım Müslüman Demokratik Cumhuriyeti kurucusu (1917), Bolşeviklerce öldürüldü 1918
- canonical: Cafer Seydahmet
aliases:
- Cafer Seyit Ahmet
- Cafer Seydamet
- Seydahmet Kırımer
- Cafer Kırımer
- Cafer Seyid Ahmet
weight: 5
notes: İstanbul'da Kırım diasporasının lideri; 1928-1942 arası aktif yazar
- canonical: Müstecip Ülküsal
aliases:
- Mustecip Ulkusal
- Müstecip Hacı Fazıl
- Ülküsal
weight: 5
notes: Romanya/Köstence merkezli Kırım Tatar lideri, 'Emel' dergisi
- canonical: Hamdullah Suphi
aliases:
- Hamdullah Suphi Tanrıöver
- Tanrıöver
weight: 4
notes: Türk Ocakları reisi, Kırım/Romanya muhaceretiyle ilgili devlet adamı
- canonical: Yusuf Akçura
aliases:
- Yusuf Akçuraoğlu
- Akçura
- Akcura
weight: 4
notes: Kazan Tatarı ama Türkçü/Tatar dünyasının ortak figürü
- canonical: İsmail Gaspıralı
aliases:
- İsmail Bey Gaspıralı
- Gasprinski
- Gasprinsky
- Ismail Gaspirali
weight: 5
notes: Tercüman gazetesi yayıncısı, Türkçülüğün babası — anma yazıları sık
- canonical: Veli İbrahim
aliases:
- Veli Ibraimov
- Veli Ibrahim
weight: 5
notes: Kırım Muhtar Cumhuriyeti başkanı, 1928'de Stalin tarafından idam
- canonical: Bekir Çobanzade
aliases:
- Bekir Çoban-zade
- Çobanzade
- Cobanzade
weight: 5
notes: Kırım Tatar dilbilimci, 1937'de tasfiye edildi
- canonical: Mehmet Niyazi
aliases:
- Memet Niyazi
- Mehmed Niyazi
weight: 4
notes: Romanya/Köstence Kırım Tatar şairi
- canonical: Habibullah Kerimi
aliases:
- Habibullah Karimi
- Kerimi
weight: 4
- canonical: Asan Sabri Ayvaz
aliases:
- Asan Sabri Ayvazov
- Ayvazov
- Sabri Ayvazov
weight: 5
notes: Kırım Tatar yazar, 1937'de Stalin terörü kurbanı
- canonical: Reşit Mediyev
aliases:
- Reşit Mediev
- Mediyev
- Medief
weight: 5
- canonical: Stalin
aliases:
- Staline
- Сталин
weight: 1
notes: Çok geniş; sadece Kırım/Tatar ile co-occurring olduğunda anlamlı
disambiguators:
- Kerim Bey
- Kerim Pa~a
- Kerim Pasa
- Kerime Hanim
- Kerime Han1m
- Kerim Efendi
- Abdulkerim
- Abdiilkerim
- kefil
- kefalet
- kefaleten
- tatar boregi
- tatar boregi
- tatar pidesi
- tatar sosu
- yaltakl
- yaltaklan
- Türk Dili Kurultayı
- Türk Dili Kurultay
- Tiirk Dili Kurultayi
- Dil Kurultayı
- Dil Kurultay
- Türk Tarih Kurultayı
- Türk Tarih Kurultay
- Tarih Kurultay
- tarih kurultay
- Halkevi Kurultay
- halkevleri kurultay
- C.H.F. Kurultay
- C.H.P. Kurultay
- Fırka Kurultay
- Parti Kurultay
- kifayet
- kifayetli
priority_windows:
- start: '1928-01-01'
end: '1928-12-31'
weight: 4
reason: Veli İbrahim idamı + Kırım Tatar tasfiyesinin başlangıcı
- start: '1932-01-01'
end: '1933-12-31'
weight: 5
reason: Holodomor / Kırım açlığı — Sovyet kıtlığının zirvesi
- start: '1936-01-01'
end: '1938-06-30'
weight: 5
reason: Stalin Büyük Terör — Çobanzade, Ayvazov, Bekirov tasfiyeleri
- start: '1939-08-23'
end: '1941-06-22'
weight: 4
reason: Molotov-Ribbentrop dönemi; Sovyet politikasında diaspora söylemi
- start: '1941-06-22'
end: '1942-12-31'
weight: 5
reason: Alman Doğu Cephesi ilerleyişi; Kırım'ın Wehrmacht tarafından işgali
co_occurrence_boost:
- - Kırım
- Tatar
- - Kırım
- muhacir
- - Sovyet
- Kırım
- - Kırım
- açlık
- - Kırım
- kollektivizasyon
- - Tatar
- Bahçesaray
- - Stalin
- Kırım

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
Build a master manifest of every PDF in the EKOS gazette archive.
Fetches each gazete.php?gazete=<slug> page once, extracts all PDF
hrefs, and writes them into manifests/ekos_master.csv. This is a
one-time operation (~5 minutes); the resulting CSV drives subsequent
search runs.
"""
import csv
import re
import sys
import time
from pathlib import Path
import requests
BASE = "https://nek.istanbul.edu.tr/ekos/GAZETE/"
HERE = Path(__file__).resolve().parent.parent
# 53 newspaper slugs discovered during recon (2026-04-28)
SLUGS = [
"aciksoz", "aksam", "anadolu", "apoyevmatini", "aravelk", "aydin",
"beyoglu", "borsa", "bugun", "cerideihavadis", "cumhuriyet", "dogu",
"ensondakika", "ensonhavadis", "haber", "hakikat", "hakimiyetimilliye",
"hakkinsesi", "halkindili", "halkinsesi", "ikdam", "ikdamhalk",
"ikdamsabahpostasi", "istanbul", "izmirpostasi", "jamanak", "kurun",
"leechodebelgrade", "metapolitefsis", "milliyet", "munakasa",
"piyasacetveli", "savas", "sondakika", "sonposta", "sonsaat",
"sontelgraf", "tan", "tasviriefkar", "turkdili", "turkischepost",
"turksozu", "ulus", "ulusalbirlik", "ulussesi", "vakit", "vatan",
"yarin", "yeniasir", "yenigun", "yenimersin", "yenisabah", "yeniyol",
]
# /<slug>/<slug>_<year>/<slug>_<year>_<month>_/<slug>_<year>_<month>_<day>_.pdf
PDF_HREF_RE = re.compile(r'href="([^"]+\.pdf)"', re.IGNORECASE)
DATE_RE = re.compile(
r'/([a-z][a-z0-9]+)_(\d{4})_([a-z]+?)_(\d+)_?\.pdf',
re.IGNORECASE
)
UA = {"User-Agent": "Mozilla/5.0 (research; ekos-gazete-search; "
"contact: kutuphane@istanbul.edu.tr)"}
def normalize_url(href: str) -> str:
    """Turn an href scraped from a catalog page into an absolute URL.

    Links that already start with ``http`` are returned untouched, links
    rooted at ``/`` get the archive host prepended, and anything else is
    appended to ``BASE`` after one leading ``./`` or ``../`` has been dropped.
    """
    if not href.startswith(("http", "/")):
        # Relative link: strip a single leading run of dots plus slash.
        relative = re.sub(r'^\.+/', '', href)
        return BASE + relative
    if href.startswith("http"):
        return href
    return "https://nek.istanbul.edu.tr" + href
def fetch_slug(slug: str, throttle: float = 1.0):
    """Fetch one gazette catalog page and list the dated PDFs it links to.

    Args:
        slug: Newspaper slug passed as the ``gazete`` query parameter.
        throttle: Seconds to sleep after the request, whether it succeeded
            or failed, so a run of failures does not hammer the server.

    Returns:
        A list of dicts with ``slug``, ``year``, ``month``, ``day`` and
        ``url`` keys, one per href that matches ``DATE_RE``. The list is
        empty when the request fails; the failure is printed, not raised.
    """
    url = f"{BASE}gazete.php?gazete={slug}"
    print(f"{slug}", end=" ", flush=True)
    rows = []
    try:
        r = requests.get(url, headers=UA, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: one broken catalog page must not abort the whole run.
        print(f"FAIL: {e}")
    else:
        for href in PDF_HREF_RE.findall(r.text):
            m = DATE_RE.search(href)
            if not m:
                # Not a dated issue PDF (does not follow the naming scheme).
                continue
            s, year, month, day = m.groups()
            rows.append({
                "slug": s.lower(),
                "year": year,
                "month": month.lower(),
                "day": day,
                "url": normalize_url(href),
            })
        print(f"{len(rows)} PDFs")
    # Throttle on both paths; previously a failed request skipped the delay.
    time.sleep(throttle)
    return rows
def main():
    """Crawl every slug in ``SLUGS`` and write the de-duplicated manifest CSV.

    The CSV goes to ``<HERE>/manifests/ekos_master.csv``; a short per-slug
    issue-count summary is printed afterwards.
    """
    out_path = HERE / "manifests" / "ekos_master.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Fetching {len(SLUGS)} gazette pages → {out_path}")
    print("Throttle: 1s/req, expected runtime ~1 minute")
    print()

    # De-dup while crawling (the catalogs occasionally repeat hrefs);
    # the first row seen for a URL wins and crawl order is preserved.
    unique_by_url = {}
    for slug in SLUGS:
        for row in fetch_slug(slug):
            unique_by_url.setdefault(row["url"], row)
    deduped = list(unique_by_url.values())

    columns = ["slug", "year", "month", "day", "url"]
    with out_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(deduped)
    print(f"\n✓ Manifest: {len(deduped)} unique PDFs → {out_path}")

    # Quick stats
    issue_counts = {}
    for row in deduped:
        name = row["slug"]
        issue_counts[name] = issue_counts.get(name, 0) + 1
    print("\nTop 10 by issue count:")
    ranked = sorted(issue_counts.items(), key=lambda item: item[1], reverse=True)
    for s, c in ranked[:10]:
        print(f" {s:>25} {c:>5}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Iterate the EKOS manifest, download each PDF, extract its text-layer,
fuzzy-search against a YAML keyword set, and append hits to JSONL.
Storage policy:
- Default: PDFs go to /tmp/ekos-cache/, processed, DELETED.
- With --keep-pdfs DIR: PDFs that produce >=1 hit are MOVED to
DIR/<slug>/<year>/<slug>_<year>_<month>_<day>.pdf for re-use.
PDFs with zero hits are still deleted (content-driven curation).
"""
import argparse
import csv
import json
import os
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, FIRST_COMPLETED, wait
from pathlib import Path
import requests
import yaml
HERE = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(HERE / "scripts"))
from lib.fuzzy import (
compile_keyword_set, compile_disambiguators,
is_false_positive, extract_snippet
)
UA = {"User-Agent": "Mozilla/5.0 (research; ekos-gazete-search)"}
DEFAULT_CACHE = Path(os.environ.get("EKOS_CACHE", "/tmp/ekos-cache"))
def in_priority_window(year: int, month: str, day: str, windows: list):
    """Return (in_window: bool, weight: int, reason: str).

    Args:
        year: Issue year.
        month: Turkish month slug as found in the manifest (ASCII or with
            diacritics). Pre-1945 month names are recognised as well; any
            other unknown slug is treated as January, as before.
        day: Day of month as a string.
        windows: ``priority_windows`` entries; each has ``start``/``end``
            as ``YYYY-MM-DD`` strings or native date objects (which is what
            YAML yields for unquoted dates), plus optional ``weight`` and
            ``reason``.

    Returns:
        ``(True, weight, reason)`` for the first window containing the
        issue date, otherwise ``(False, 0, None)``. An unparsable issue date
        also yields ``(False, 0, None)``; malformed windows are skipped.
    """
    from datetime import date, datetime

    # Map Turkish month slugs to numbers for comparison
    month_map = {
        "ocak": 1, "subat": 2, "şubat": 2, "mart": 3, "nisan": 4, "mayis": 5,
        "mayıs": 5, "haziran": 6, "temmuz": 7, "agustos": 8, "ağustos": 8,
        "eylul": 9, "eylül": 9, "ekim": 10, "kasim": 11, "kasım": 11,
        "aralik": 12, "aralık": 12,
        # Month names used before 1945; without them these issues were
        # silently dated to January and matched the wrong window.
        "kanunusani": 1, "kanunuevvel": 12,
        "tesrinievvel": 10, "tesrinisani": 11,
    }
    try:
        cur = date(int(year), month_map.get(month.lower(), 1), int(day))
    except (AttributeError, TypeError, ValueError, OverflowError):
        return False, 0, None

    def _as_date(value):
        # datetime is a subclass of date, so it has to be tested first.
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, date):
            return value
        return datetime.strptime(value, "%Y-%m-%d").date()

    for w in windows:
        try:
            s = _as_date(w["start"])
            e = _as_date(w["end"])
        except (KeyError, TypeError, ValueError):
            # Malformed window definition: ignore it, keep checking the rest.
            continue
        if s <= cur <= e:
            return True, w.get("weight", 3), w.get("reason", "")
    return False, 0, None
def pdftotext_page(pdf_path: Path, page: int, timeout: int = 30) -> str:
    """Run ``pdftotext -layout`` on one page and return its stdout.

    Returns an empty string when the command exceeds *timeout* seconds.
    Undecodable bytes in the output are replaced rather than raising.
    """
    page_arg = str(page)
    command = [
        "pdftotext", "-layout",
        "-f", page_arg, "-l", page_arg,   # first page == last page
        str(pdf_path), "-",               # "-" sends the text to stdout
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True, text=True, timeout=timeout, errors="replace"
        )
    except subprocess.TimeoutExpired:
        return ""
    return completed.stdout
def get_page_count(pdf_path: Path) -> int:
    """Return the page count reported by ``pdfinfo``, or 1 as a fallback.

    The fallback covers both a missing ``Pages:`` line in the output and
    any exception raised while running or parsing the command.
    """
    command = ["pdfinfo", str(pdf_path)]
    try:
        info = subprocess.run(command, capture_output=True, text=True,
                              timeout=15)
        found = re.search(r"Pages:\s+(\d+)", info.stdout)
        if found:
            return int(found.group(1))
        return 1
    except Exception:
        return 1
def process_pdf(row: dict, patterns: list, disambiguators: list,
                cache_dir: Path, priority_info: tuple,
                keep_dir: Path | None = None) -> list:
    """Download one issue, search every page, and return the hits.

    Args:
        row: Manifest row with ``slug``, ``year``, ``month``, ``day``, ``url``.
        patterns: Iterable of ``(label, weight, compiled_regex)`` triples.
        disambiguators: Passed through to ``is_false_positive``.
        cache_dir: Directory the PDF is downloaded into.
        priority_info: ``(in_window, weight, reason)`` for this issue.
        keep_dir: If set and the PDF produces >=1 hit, the PDF is moved to
            keep_dir/<slug>/<year>/<basename>.pdf and each hit gets a
            ``local_pdf`` key. Zero-hit PDFs are always deleted.

    Returns:
        List of hit dicts (possibly empty). Download failures and non-200
        responses yield an empty list; they are never raised.
    """
    slug, year, month, day, url = (row["slug"], row["year"], row["month"],
                                   row["day"], row["url"])
    pdf_path = cache_dir / f"{slug}_{year}_{month}_{day}.pdf"
    in_window, win_weight, win_reason = priority_info

    # Download. The context manager releases the streamed connection on
    # every exit path, including the early return below.
    try:
        with requests.get(url, headers=UA, timeout=120, stream=True) as r:
            if r.status_code != 200:
                return []
            with pdf_path.open("wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
    except Exception as e:
        print(f" [download error] {slug} {year}/{month}/{day}: {e}",
              file=sys.stderr)
        # Do not leave a truncated file behind in the cache.
        try:
            pdf_path.unlink(missing_ok=True)
        except OSError as cleanup_err:
            print(f" [cleanup error] {pdf_path}: {cleanup_err}",
                  file=sys.stderr)
        return []

    # The window bonus is the same for every hit of this issue.
    window_bonus = win_weight if in_window else 0
    hits = []
    try:
        n_pages = get_page_count(pdf_path)
        for page in range(1, n_pages + 1):
            text = pdftotext_page(pdf_path, page)
            if len(text) < 50:
                # Too little text to be worth searching.
                continue
            # Search every compiled pattern against this page
            for label, weight, pat in patterns:
                for m in pat.finditer(text):
                    if is_false_positive(text, m.start(), m.end(),
                                         disambiguators, window=200):
                        continue
                    snippet = extract_snippet(text, m.start(), m.end(), 200)
                    hits.append({
                        "slug": slug, "year": year, "month": month,
                        "day": day, "page": page,
                        "keyword": label, "match": m.group(0),
                        "snippet": snippet, "url": url,
                        "weight": weight + window_bonus,
                        "priority_window": in_window,
                        "window_reason": win_reason if in_window else None,
                    })
    except Exception as e:
        # Hits collected before the failure are still returned.
        print(f" [error] {slug} {year}/{month}/{day}: {e}", file=sys.stderr)
    finally:
        try:
            if keep_dir is not None and hits:
                target_dir = keep_dir / slug / str(year)
                target_dir.mkdir(parents=True, exist_ok=True)
                target_path = target_dir / pdf_path.name
                shutil.move(str(pdf_path), str(target_path))
                for h in hits:
                    h["local_pdf"] = str(target_path)
            else:
                pdf_path.unlink()
        except Exception as e:
            print(f" [retain error] {pdf_path}: {e}", file=sys.stderr)
    return hits
def main():
    """Parse CLI options, filter the manifest, and scan the selected issues.

    Hits are appended to the ``--out`` JSONL file as each issue finishes, so
    an interrupted run keeps everything found up to that point.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--manifest", default=str(HERE / "manifests/ekos_master.csv"))
    ap.add_argument("--keywords", default=str(HERE / "keywords/kirim.yaml"))
    ap.add_argument("--out", default=str(HERE / "hits/kirim.jsonl"))
    ap.add_argument("--priority-only", action="store_true",
                    help="Only process issues inside priority_windows")
    ap.add_argument("--year-from", type=int)
    ap.add_argument("--year-to", type=int)
    ap.add_argument("--slug", help="Restrict to gazette slug(s); comma-separated for multiple")
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--limit", type=int, help="Process at most N issues")
    ap.add_argument("--throttle", type=float, default=0.25,
                    help="Seconds to sleep between job dispatches")
    ap.add_argument("--cache", default=str(DEFAULT_CACHE))
    ap.add_argument("--keep-pdfs", default=None,
                    help="Move hit-producing PDFs into DIR/<slug>/<year>/ "
                         "instead of deleting them. Zero-hit PDFs are still "
                         "deleted (content-driven curation).")
    args = ap.parse_args()

    keep_dir = Path(args.keep_pdfs) if args.keep_pdfs else None
    if keep_dir:
        keep_dir.mkdir(parents=True, exist_ok=True)
        print(f"PDF retention: hit-only → {keep_dir}/<slug>/<year>/")

    # Load keywords. The YAML is full of Turkish/Cyrillic text, so it is read
    # as UTF-8 explicitly instead of relying on the locale default.
    with open(args.keywords, encoding="utf-8") as f:
        keyword_data = yaml.safe_load(f)
    patterns = compile_keyword_set(keyword_data)
    disambiguators = compile_disambiguators(keyword_data)
    # "or []" also covers a priority_windows key that is present but null.
    windows = keyword_data.get("priority_windows") or []
    print(f"Compiled {len(patterns)} patterns, "
          f"{len(disambiguators)} disambiguators, "
          f"{len(windows)} priority windows")

    cache_dir = Path(args.cache)
    cache_dir.mkdir(parents=True, exist_ok=True)
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Load + filter manifest
    slug_filter = set(s.strip() for s in args.slug.split(",")) if args.slug else None
    rows = []
    with open(args.manifest, newline="", encoding="utf-8") as f:
        for r in csv.DictReader(f):
            if slug_filter and r["slug"] not in slug_filter:
                continue
            try:
                y = int(r["year"])
            except ValueError:
                continue
            if args.year_from and y < args.year_from:
                continue
            if args.year_to and y > args.year_to:
                continue
            in_w, _, _ = in_priority_window(y, r["month"], r["day"], windows)
            r["_in_window"] = in_w
            if args.priority_only and not in_w:
                continue
            rows.append(r)

    # Sort priority-window first.
    # NOTE(review): month and day are compared as strings here, so the order
    # inside each group is lexicographic rather than strictly chronological.
    rows.sort(key=lambda r: (0 if r["_in_window"] else 1,
                             r["year"], r["month"], r["day"]))
    if args.limit:
        rows = rows[:args.limit]
    print(f"Processing {len(rows)} issues "
          f"({sum(1 for r in rows if r['_in_window'])} in priority windows)")
    print(f"Workers: {args.workers}, throttle: {args.throttle}s")
    print(f"Output: {out_path}")
    print()

    start = time.time()
    n_hits = 0
    n_done = 0

    def submit_job(executor, row):
        prio = in_priority_window(int(row["year"]), row["month"],
                                  row["day"], windows)
        return executor.submit(process_pdf, row, patterns, disambiguators,
                               cache_dir, prio, keep_dir)

    with out_path.open("a", encoding="utf-8") as out_f, \
            ThreadPoolExecutor(max_workers=args.workers) as ex:
        # Interleaved submit+collect: keep ~workers*2 jobs in flight,
        # flush hits & log progress as each future completes (crash-safe).
        row_iter = iter(rows)

        def submit_next():
            try:
                r = next(row_iter)
            except StopIteration:
                return None
            if args.throttle > 0:
                time.sleep(args.throttle)
            return submit_job(ex, r)

        in_flight = set()
        for _ in range(args.workers * 2):
            fut = submit_next()
            if fut is None:
                break
            in_flight.add(fut)

        while in_flight:
            done, in_flight = wait(in_flight, return_when=FIRST_COMPLETED)
            for fut in done:
                try:
                    hits = fut.result()
                except Exception as e:
                    print(f" [worker error] {e}", file=sys.stderr)
                    hits = []
                for h in hits:
                    out_f.write(json.dumps(h, ensure_ascii=False) + "\n")
                if hits:
                    out_f.flush()
                n_hits += len(hits)
                n_done += 1
                if n_done % 25 == 0:
                    rate = n_done / (time.time() - start)
                    eta = (len(rows) - n_done) / max(rate, 0.01)
                    print(f" [{n_done}/{len(rows)}] hits={n_hits} "
                          f"rate={rate:.1f}/s eta={eta/60:.1f}min",
                          flush=True)
                # Refill the slot that has just been freed.
                nxt = submit_next()
                if nxt is not None:
                    in_flight.add(nxt)

    print(f"\n✓ Done in {(time.time()-start)/60:.1f}min: "
          f"{n_done} issues processed, {n_hits} hits → {out_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Render hits/<topic>.jsonl into Obsidian-friendly Markdown reports
under 6-Geopolitics/Russia/03. HISTORICAL CONTEXT/ .
Output:
EKOS-<Topic>-Bulgular.md — master, cross-year overview
EKOS-<Topic>-<YYYY>.md — per-year detailed list
"""
import argparse
import json
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
VAULT_BASE = Path("/home/salva/Obsidian/6-Geopolitics/Russia/03. HISTORICAL CONTEXT")
HERE = Path(__file__).resolve().parent.parent
MONTH_TR = {
"ocak": "Ocak", "subat": "Şubat", "mart": "Mart", "nisan": "Nisan",
"mayis": "Mayıs", "haziran": "Haziran", "temmuz": "Temmuz",
"agustos": "Ağustos", "eylul": "Eylül", "ekim": "Ekim",
"kasim": "Kasım", "aralik": "Aralık",
}
def fmt_date(year: str, month: str, day: str) -> str:
    """Join the parts as ``<year>-<month>-<day>``, zero-padding day to width 2."""
    padded_day = format(day, ">02s")
    return "{}-{}-{}".format(year, month, padded_day)
def load_hits(path: Path) -> list:
    """Parse a UTF-8 JSONL file into a list, ignoring blank lines."""
    with path.open(encoding="utf-8") as f:
        stripped = (raw.strip() for raw in f)
        return [json.loads(text) for text in stripped if text]
def write_master(path: Path, hits: list, topic: str):
    """Write the cross-year overview note for *topic* to *path*.

    The note contains a per-year table linking to the yearly notes, the most
    frequent keywords and newspapers, and the 30 highest-weighted hits.

    Args:
        path: Markdown file to create or overwrite.
        hits: Hit dicts as produced by the search script.
        topic: Label used in the title and in the yearly-note link names.
    """
    by_year = defaultdict(list)
    for h in hits:
        by_year[h["year"]].append(h)
    kw_counter = Counter(h["keyword"] for h in hits)
    slug_counter = Counter(h["slug"] for h in hits)
    priority_hits = [h for h in hits if h.get("priority_window")]
    # One timestamp for the whole note keeps header and footer consistent.
    now = datetime.now()
    today = now.strftime('%Y-%m-%d')
    with path.open("w", encoding="utf-8") as f:
        f.write(f"""---
up:: [[Russia - PDF Library Index]]
tag:: [[6.1-Geopolitical Analysis]]
created:: {today}
topic:: {topic}
total_hits:: {len(hits)}
priority_hits:: {len(priority_hits)}
source:: EKOS - İstanbul Üniversitesi NEK
---
# EKOS — {topic} Bulguları (Master)
> **İstanbul Üniversitesi Nadir Eserler Kütüphanesi gazete arşivi (1928-1942)**
> Toplam **{len(hits)} hit** — bunların **{len(priority_hits)}** tanesi öncelikli zaman pencerelerinde.
> Tarama tarihi: {today}
## Yıllara Göre Dağılım
| Yıl | Toplam Hit | Öncelik Hit | Yıllık Rapor |
|---|---:|---:|---|
""")
        for year in sorted(by_year):
            year_hits = by_year[year]
            prio_count = sum(1 for h in year_hits if h.get("priority_window"))
            link = f"EKOS-{topic}-{year}"
            f.write(f"| {year} | {len(year_hits)} | {prio_count} | [[{link}]] |\n")
        f.write("\n## En Sık Geçen Anahtar Terimler\n\n")
        for kw, cnt in kw_counter.most_common(25):
            f.write(f"- **{kw}** — {cnt}\n")
        f.write("\n## En Verimli Gazeteler\n\n")
        for slug, cnt in slug_counter.most_common(20):
            f.write(f"- `{slug}` — {cnt}\n")
        # Top weighted hits (most likely smoking guns)
        f.write("\n## En Yüksek Skorlu 30 Hit (öncelikli inceleme)\n\n")
        top = sorted(hits, key=lambda h: -h.get("weight", 0))[:30]
        for h in top:
            date_str = fmt_date(h["year"], h["month"], h["day"])
            # A raw newline would end the blockquote after its first line,
            # so the snippet is flattened the same way the CSV export does.
            snippet = (h.get("snippet") or "").replace("\n", " ").strip()
            # Slug and date used to be written back to back with no separator.
            f.write(f"### {h['slug']} — {date_str} — sayfa {h['page']}\n\n")
            f.write(f"- **Kelime:** {h['keyword']} (match: `{h['match']}`)\n")
            f.write(f"- **Skor:** {h.get('weight', 0)}")
            if h.get("priority_window"):
                # "or ''" keeps a stored None from being rendered as "None".
                f.write(f" _(öncelikli pencere: {h.get('window_reason') or ''})_")
            f.write(f"\n- **Kaynak:** [PDF]({h['url']})\n")
            f.write(f"- **Bağlam:**\n > {snippet}\n\n")
        f.write(f"\n---\n_Otomatik üretildi: ekos-gazete-search skill, {now.strftime('%Y-%m-%d %H:%M')}_\n")
def write_yearly(path: Path, hits: list, year: str, topic: str, master_stem: str):
    """Write the detailed note for one year to *path*.

    Hits are grouped by issue date; the groups appear in calendar order and
    the hits inside a group are ordered by descending weight.

    Args:
        path: Markdown file to create or overwrite.
        hits: Hit dicts for this year.
        year: Year label used in the front matter and the title.
        topic: Topic label used in the front matter and the title.
        master_stem: Note name of the master report, used for the ``up`` link.
    """
    # Month slug -> month number, including the pre-1945 month names, so the
    # date groups can be ordered by calendar instead of by month spelling.
    month_no = {
        "ocak": 1, "subat": 2, "şubat": 2, "mart": 3, "nisan": 4,
        "mayis": 5, "mayıs": 5, "haziran": 6, "temmuz": 7,
        "agustos": 8, "ağustos": 8, "eylul": 9, "eylül": 9, "ekim": 10,
        "kasim": 11, "kasım": 11, "aralik": 12, "aralık": 12,
        "kanunusani": 1, "kanunuevvel": 12,
        "tesrinievvel": 10, "tesrinisani": 11,
    }

    def chrono_key(h):
        # Unknown months and non-numeric days sort last instead of raising.
        try:
            day_no = int(h["day"])
        except (TypeError, ValueError):
            day_no = 99
        return (str(h["year"]), month_no.get(str(h["month"]).lower(), 99), day_no)

    by_date = defaultdict(list)
    for h in hits:
        by_date[fmt_date(h["year"], h["month"], h["day"])].append(h)
    kw_counter = Counter(h["keyword"] for h in hits)
    priority_hits = [h for h in hits if h.get("priority_window")]
    window_reasons = set(h.get("window_reason") for h in hits if h.get("priority_window"))
    window_reasons.discard(None)
    with path.open("w", encoding="utf-8") as f:
        f.write(f"""---
up:: [[{master_stem}]]
tag:: [[6.1-Geopolitical Analysis]]
year:: {year}
topic:: {topic}
hit_count:: {len(hits)}
priority_hits:: {len(priority_hits)}
---
# EKOS — {topic} {year}
**Toplam hit:** {len(hits)}{f' — bunların {len(priority_hits)} tanesi öncelikli pencerede' if priority_hits else ''}.
""")
        if window_reasons:
            f.write("**Öncelikli pencereler bu yılda:**\n")
            # Sorted so that re-rendering the same hits gives the same file.
            for r in sorted(window_reasons, key=str):
                f.write(f"- {r}\n")
            f.write("\n")
        f.write("**Kelime dağılımı:** ")
        f.write(", ".join(f"{k} ({v})" for k, v in kw_counter.most_common(10)))
        f.write("\n\n---\n\n")
        # The date string is used as tie-breaker to keep the order total.
        ordered_dates = sorted(
            by_date, key=lambda ds: (chrono_key(by_date[ds][0]), ds))
        for date_str in ordered_dates:
            date_hits = sorted(by_date[date_str], key=lambda h: -h.get("weight", 0))
            month_pretty = MONTH_TR.get(date_hits[0]["month"], date_hits[0]["month"])
            f.write(f"## {date_str} _({month_pretty})_\n\n")
            for h in date_hits:
                # A raw newline would end the blockquote after its first line.
                snippet = (h.get("snippet") or "").replace("\n", " ").strip()
                f.write(f"### {h['slug']} — sayfa {h['page']} — `{h['keyword']}`\n\n")
                f.write(f"> {snippet}\n\n")
                f.write(f"- Match: `{h['match']}` • Skor: {h.get('weight', 0)}")
                if h.get("priority_window"):
                    f.write(" 🔥")
                f.write(f"\n- [PDF]({h['url']})\n\n")
        f.write(f"\n---\n_ekos-gazete-search, {datetime.now().strftime('%Y-%m-%d %H:%M')}_\n")
def main():
    """Render the master note and one note per year from a hits JSONL file."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--hits", default=str(HERE / "hits/kirim.jsonl"))
    parser.add_argument("--topic", default="Kirim",
                        help="Used in filenames (e.g. Kirim → EKOS-Kirim-1932.md)")
    parser.add_argument("--vault", default=str(VAULT_BASE),
                        help="Output base dir under vault")
    opts = parser.parse_args()

    hits_path = Path(opts.hits)
    if not hits_path.exists() or hits_path.stat().st_size == 0:
        print(f"[!] No hits at {hits_path} — run 02_search_pdfs.py first")
        return
    hits = load_hits(hits_path)
    if not hits:
        print(f"[!] hits file empty: {hits_path}")
        return
    print(f"Loaded {len(hits)} hits")

    vault_dir = Path(opts.vault)
    vault_dir.mkdir(parents=True, exist_ok=True)

    master_path = vault_dir / f"EKOS-{opts.topic}-Bulgular.md"
    write_master(master_path, hits, opts.topic)
    print(f" ✓ master → {master_path}")

    # One note per year, processed in ascending year order.
    grouped = {}
    for hit in hits:
        grouped.setdefault(hit["year"], []).append(hit)
    for year in sorted(grouped):
        year_hits = grouped[year]
        year_path = vault_dir / f"EKOS-{opts.topic}-{year}.md"
        write_yearly(year_path, year_hits, year, opts.topic, master_path.stem)
        print(f"{year} ({len(year_hits)} hit) → {year_path.name}")
    print(f"\n✓ Reports rendered under {vault_dir}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Export hits/<topic>.jsonl into:
reports/EKOS-<Topic>.csv — flat, all hits, chronological
reports/EKOS-<Topic>-Rapor.docx — formatted Word report (TOC, top-30 smoking
guns, per-year sections with snippets)
Examples:
python scripts/04_export.py
python scripts/04_export.py --topic Kirim --out-dir /home/salva/Documents/EKOS-out
"""
import argparse
import csv
import json
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, RGBColor, Cm
HERE = Path(__file__).resolve().parent.parent
MONTH_TR = {
"ocak": ("Ocak", 1), "subat": ("Şubat", 2), "şubat": ("Şubat", 2),
"mart": ("Mart", 3), "nisan": ("Nisan", 4), "mayis": ("Mayıs", 5),
"mayıs": ("Mayıs", 5), "haziran": ("Haziran", 6), "temmuz": ("Temmuz", 7),
"agustos": ("Ağustos", 8), "ağustos": ("Ağustos", 8),
"eylul": ("Eylül", 9), "eylül": ("Eylül", 9), "ekim": ("Ekim", 10),
"kasim": ("Kasım", 11), "kasım": ("Kasım", 11),
"aralik": ("Aralık", 12), "aralık": ("Aralık", 12),
"kanunusani": ("Ocak", 1), "kanunuevvel": ("Aralık", 12),
"tesrinievvel": ("Ekim", 10), "tesrinisani": ("Kasım", 11),
}
def date_key(h):
    """Sort key: (year, month_num, day).

    Unknown month names map to 99 and unreadable days to 99, so such hits
    sort after the dated ones of the same year or month.
    """
    _pretty, month_num = MONTH_TR.get(h["month"].lower(), (h["month"], 99))
    try:
        day_num = int(h["day"])
    except Exception:
        day_num = 99
    return (int(h["year"]), month_num, day_num)
def load_hits(path):
    """Parse a UTF-8 JSONL file into a list, ignoring blank lines."""
    with path.open(encoding="utf-8") as f:
        stripped = (raw.strip() for raw in f)
        return [json.loads(text) for text in stripped if text]
def write_csv(path, hits):
    """All hits flat. Sort: chronological, then weight DESC within same date.

    Snippets are flattened to a single line; keys not listed in the header
    are dropped. Returns the number of rows written.
    """
    columns = ["year", "month", "day", "slug", "page",
               "keyword", "match", "weight",
               "priority_window", "window_reason", "snippet", "url"]

    def one_line(hit):
        # Copy so that the caller's hit dicts are left untouched.
        flat = dict(hit)
        flat["snippet"] = (flat.get("snippet") or "").replace("\n", " ").strip()
        return flat

    ordered = sorted(hits, key=lambda h: (date_key(h), -h.get("weight", 0)))
    with path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        for hit in ordered:
            writer.writerow(one_line(hit))
    return len(ordered)
def _set_cell_bold(cell, bold=True):
    """Set the ``bold`` flag on every run of every paragraph in *cell*."""
    all_runs = (run for para in cell.paragraphs for run in para.runs)
    for run in all_runs:
        run.bold = bold
def write_docx(path, hits, topic):
    """Render *hits* as a formatted Word report and save it to *path*.

    Layout: title block, summary figures, three distribution tables (year,
    keyword, newspaper), the 30 highest-weighted hits, then one section per
    year with the hits grouped by issue date, and a generated-by footer.

    Args:
        path: Destination ``.docx`` file.
        hits: Hit dicts as produced by the search script. An empty list
            produces a report with empty tables instead of raising.
        topic: Label used in the document title.
    """
    doc = Document()
    # Margins
    for section in doc.sections:
        section.top_margin = Cm(2.0)
        section.bottom_margin = Cm(2.0)
        section.left_margin = Cm(2.0)
        section.right_margin = Cm(2.0)

    # Title
    t = doc.add_heading(f"EKOS — {topic} Bulguları", level=0)
    t.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sub = doc.add_paragraph()
    sub.alignment = WD_ALIGN_PARAGRAPH.CENTER
    r = sub.add_run("İstanbul Üniversitesi NEK Gazete Arşivi (1928-1942)")
    r.italic = True
    r.font.size = Pt(11)

    # Stats overview
    by_year = defaultdict(list)
    for h in hits:
        by_year[h["year"]].append(h)
    kw_counter = Counter(h["keyword"] for h in hits)
    slug_counter = Counter(h["slug"] for h in hits)
    priority_hits = [h for h in hits if h.get("priority_window")]
    doc.add_paragraph()
    p = doc.add_paragraph()
    p.add_run("Üretim tarihi: ").bold = True
    p.add_run(datetime.now().strftime("%Y-%m-%d %H:%M"))
    p = doc.add_paragraph()
    p.add_run("Toplam vuruş: ").bold = True
    p.add_run(f"{len(hits)} ")
    p.add_run("Öncelikli pencere içinde: ").bold = True
    p.add_run(f"{len(priority_hits)}")
    p = doc.add_paragraph()
    p.add_run("Yıl aralığı: ").bold = True
    yrs = sorted(by_year)
    # yrs is empty when there are no hits; indexing it used to raise.
    p.add_run(f"{yrs[0]}–{yrs[-1]} " if yrs else "— ")
    p.add_run("Gazete sayısı: ").bold = True
    p.add_run(f"{len(slug_counter)}")

    # Yearly distribution table
    doc.add_heading("Yıllara Göre Dağılım", level=1)
    tbl = doc.add_table(rows=1, cols=3)
    tbl.style = "Light Grid Accent 1"
    hdr = tbl.rows[0].cells
    hdr[0].text = "Yıl"
    hdr[1].text = "Toplam"
    hdr[2].text = "Öncelikli"
    for c in hdr:
        _set_cell_bold(c, True)
    for y in yrs:
        row = tbl.add_row().cells
        row[0].text = str(y)
        row[1].text = str(len(by_year[y]))
        row[2].text = str(sum(1 for h in by_year[y] if h.get("priority_window")))

    # Keyword distribution
    doc.add_heading("Anahtar Kelime Dağılımı (top 20)", level=1)
    tbl = doc.add_table(rows=1, cols=2)
    tbl.style = "Light Grid Accent 1"
    hdr = tbl.rows[0].cells
    hdr[0].text = "Anahtar"
    hdr[1].text = "Sayı"
    for c in hdr:
        _set_cell_bold(c, True)
    for kw, n in kw_counter.most_common(20):
        row = tbl.add_row().cells
        row[0].text = kw
        row[1].text = str(n)

    # Slug productivity
    doc.add_heading("En Verimli Gazeteler", level=1)
    tbl = doc.add_table(rows=1, cols=2)
    tbl.style = "Light Grid Accent 1"
    hdr = tbl.rows[0].cells
    hdr[0].text = "Gazete"
    hdr[1].text = "Vuruş"
    for c in hdr:
        _set_cell_bold(c, True)
    for slug, n in slug_counter.most_common(15):
        row = tbl.add_row().cells
        row[0].text = slug
        row[1].text = str(n)

    # Top scored hits
    doc.add_page_break()
    doc.add_heading("En Yüksek Skorlu 30 Vuruş", level=1)
    # Italic is a property of a run; assigning it on the paragraph object
    # (as the code used to do) has no effect on the rendered text.
    note = doc.add_paragraph()
    note.add_run(
        "Skorlama: temel kelime ağırlığı + öncelikli pencere bonusu. "
        "Yüksek skorlu vuruşlar manuel okumada ilk öncelik."
    ).italic = True
    top = sorted(hits, key=lambda h: -h.get("weight", 0))[:30]
    for i, h in enumerate(top, 1):
        m_pretty = MONTH_TR.get(h["month"].lower(), (h["month"], 0))[0]
        head = doc.add_paragraph()
        # Slug and year used to be written back to back with no separator.
        run = head.add_run(
            f"{i}. {h['slug']} — {h['year']} {m_pretty} {h['day']} — s. {h['page']}")
        run.bold = True
        run.font.size = Pt(11)
        meta = doc.add_paragraph()
        meta.add_run("Anahtar: ").bold = True
        meta.add_run(f"{h['keyword']} ")
        meta.add_run("Eşleşme: ").bold = True
        meta.add_run(f"{h['match']} ")
        meta.add_run("Skor: ").bold = True
        meta.add_run(f"{h.get('weight', 0)}")
        if h.get("priority_window"):
            wr = h.get("window_reason") or ""
            run2 = meta.add_run(f" [öncelikli: {wr[:60]}]")
            run2.italic = True
            run2.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
        sn = doc.add_paragraph()
        sn.paragraph_format.left_indent = Cm(0.6)
        sn_run = sn.add_run(h.get("snippet", ""))
        sn_run.italic = True
        sn_run.font.size = Pt(10)
        url_p = doc.add_paragraph()
        url_p.paragraph_format.left_indent = Cm(0.6)
        url_run = url_p.add_run(f"PDF: {h['url']}")
        url_run.font.size = Pt(8)
        url_run.font.color.rgb = RGBColor(0x55, 0x55, 0x55)

    # Per-year sections
    for year in yrs:
        doc.add_page_break()
        year_hits = sorted(by_year[year],
                           key=lambda h: (date_key(h), -h.get("weight", 0)))
        prio = sum(1 for h in year_hits if h.get("priority_window"))
        doc.add_heading(f"{year} ({len(year_hits)} vuruş, {prio} öncelikli)",
                        level=1)
        # Quick keyword summary for the year
        yk = Counter(h["keyword"] for h in year_hits)
        summary = doc.add_paragraph()
        summary.add_run("Anahtar dağılımı: ").bold = True
        summary.add_run(", ".join(f"{k}({v})" for k, v in yk.most_common(8)))
        # Group by date
        by_date = defaultdict(list)
        for h in year_hits:
            key = (h["year"], h["month"], h["day"])
            by_date[key].append(h)
        for dk in sorted(by_date, key=lambda k: (
                int(k[0]),
                MONTH_TR.get(k[1].lower(), (k[1], 99))[1],
                int(k[2]) if str(k[2]).isdigit() else 99)):
            y, m, d = dk
            m_pretty = MONTH_TR.get(m.lower(), (m, 0))[0]
            doc.add_heading(f"{y} {m_pretty} {d}", level=3)
            for h in by_date[dk]:
                p = doc.add_paragraph()
                p.add_run(f"{h['slug']} ").bold = True
                # Separator added: the page number used to run straight into
                # the keyword.
                p.add_run(f"s.{h['page']} — ")
                kr = p.add_run(h["keyword"])
                kr.bold = True
                if h.get("priority_window"):
                    kr.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
                p.add_run(f" (skor {h.get('weight', 0)})")
                sn = doc.add_paragraph()
                sn.paragraph_format.left_indent = Cm(0.5)
                sr = sn.add_run(h.get("snippet", ""))
                sr.italic = True
                sr.font.size = Pt(9)

    # Footer
    doc.add_page_break()
    footer = doc.add_paragraph()
    footer.alignment = WD_ALIGN_PARAGRAPH.CENTER
    fr = footer.add_run(f"Otomatik üretildi: ekos-gazete-search skill, "
                        f"{datetime.now().strftime('%Y-%m-%d %H:%M')}")
    fr.italic = True
    fr.font.size = Pt(9)
    doc.save(str(path))
def main():
    """Export a hits JSONL file to a CSV and a DOCX report in ``--out-dir``.

    Nothing is written when the hits file is missing, empty, or contains
    only blank lines.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--hits", default=str(HERE / "hits/kirim.jsonl"))
    ap.add_argument("--topic", default="Kirim")
    ap.add_argument("--out-dir", default=str(HERE / "reports"))
    args = ap.parse_args()

    hits_path = Path(args.hits)
    if not hits_path.exists() or hits_path.stat().st_size == 0:
        print(f"[!] No hits at {hits_path}")
        return
    hits = load_hits(hits_path)
    if not hits:
        # A file holding only blank lines passes the size check above but
        # yields no hits; stop here instead of exporting empty reports.
        print(f"[!] hits file empty: {hits_path}")
        return
    print(f"Loaded {len(hits)} hits from {hits_path}")

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    csv_path = out_dir / f"EKOS-{args.topic}.csv"
    docx_path = out_dir / f"EKOS-{args.topic}-Rapor.docx"
    n = write_csv(csv_path, hits)
    print(f" ✓ CSV ({n} rows) → {csv_path}")
    write_docx(docx_path, hits, args.topic)
    print(f" ✓ DOCX → {docx_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,173 @@
"""
OCR-tolerant fuzzy regex builder for Turkish text.
Strategy: 2014-vintage Turkish OCR systematically destroys diacritics.
Each character is replaced with a character class that covers all
plausible OCR misreadings. See: keywords/kirim.yaml notes column.
"""
import re
import unicodedata
# Character → tolerant character class mapping.
# Keys are lowercase: build_pattern() lower-cases each character (Turkish-aware) before the lookup.
DIACRITIC_CLASSES = {
# The ı/i/I/İ family — the most damaged
'i': r'[1iIıİlj|!]',
'ı': r'[1iIıİlj|!]',
# Sibilants
's': r'[s$ş]',
'ş': r'[s$ş~]',
# Plosives
'c': r'[cç]',
'ç': r'[cç]',
'g': r'[gğ]',
'ğ': r'[gğq]',
# Vowels
'u': r'(?:[uü]|ii)',
'ü': r'(?:[uü]|ii)',
'o': r'[oö0]',
'ö': r'[oö0]',
'a': r'[aâå]',
'â': r'[aâå]',
'e': r'[eé]',
}
# Non-letter separators in OCR can be whitespace, dash, underscore, tilde, dot, or comma.
WORD_SEP = r'[\s\-_~.,]+'
def turkish_lower(s: str) -> str:
    """Lowercase *s* using Turkish casing rules.

    The two capital I letters are mapped by hand first (İ→i, I→ı); every
    other character goes through the regular ``str.lower()``.
    """
    turkish_capitals = {ord('İ'): 'i', ord('I'): 'ı'}
    return s.translate(turkish_capitals).lower()
def build_pattern(word: str) -> str:
    """Return an OCR-tolerant regex source string for a word or phrase.

    Each character of *word* is translated independently:
      * characters with an entry in DIACRITIC_CLASSES (looked up after
        Turkish-aware lowercasing) become that tolerant class;
      * a space becomes WORD_SEP;
      * any other letter becomes a two-member lower/upper class;
      * everything else is regex-escaped.

    The result is wrapped in negative lookarounds so the match cannot be
    glued to a neighbouring word character or Turkish letter.
    """
    # \b is unreliable next to character classes, so the boundary is
    # expressed as "no word character on either side".
    word_char = r'[\wıİşŞçÇğĞüÜöÖâÂ]'

    def piece(ch: str) -> str:
        tolerant = DIACRITIC_CLASSES.get(turkish_lower(ch))
        if tolerant is not None:
            return tolerant
        if ch == ' ':
            return WORD_SEP
        if ch.isalpha():
            return f'[{ch.lower()}{ch.upper()}]'
        return re.escape(ch)

    body = ''.join(map(piece, word))
    return f'(?<!{word_char})' + body + f'(?!{word_char})'
def build_pattern_with_suffixes(word: str, suffixes: list = None) -> str:
    """Return the pattern for *word*, optionally followed by one of *suffixes*.

    With no suffixes the result is exactly ``build_pattern(word)``.
    Otherwise the trailing boundary lookahead is cut off, an optional
    non-capturing group of the (regex-escaped, literal) suffixes is
    appended, and the boundary is put back after it.
    """
    pattern = build_pattern(word)
    if not suffixes:
        return pattern

    # Must be the same text build_pattern() appends as its closing boundary.
    trailing_boundary = r'(?![\wıİşŞçÇğĞüÜöÖâÂ])'
    stem = pattern[:-len(trailing_boundary)]
    optional_suffix = '(?:' + '|'.join(map(re.escape, suffixes)) + ')?'
    return stem + optional_suffix + trailing_boundary
def _compile_terms(canonical: str, terms: list, weight, build) -> list:
    """Compile every term with *build*; warn about and skip terms whose regex is invalid."""
    out = []
    for term in terms:
        try:
            regex = re.compile(build(term), re.IGNORECASE | re.UNICODE)
        except re.error as e:
            print(f" [warn] regex compile failed for {term!r}: {e}")
            continue
        out.append((canonical, weight, regex))
    return out


def compile_keyword_set(keyword_data: dict) -> list:
    """
    Compile a YAML keyword set into a list of (label, weight, regex) tuples.

    Args:
        keyword_data: parsed keyword file. Two optional sections are read:
            ``keywords``     entries with ``canonical`` plus optional
                             ``aliases``, ``suffixes`` and ``weight`` (default 3);
            ``proper_nouns`` entries with ``canonical`` plus optional
                             ``aliases`` and ``weight`` (default 5); these
                             are compiled without suffix handling.

    Returns:
        One tuple per canonical term and alias, labelled with the canonical
        form. Tuples appear in input order (keywords first, then proper
        nouns); no sorting by weight happens here.

    A section or field that is present but empty in the YAML is parsed as
    ``None``; it is treated the same as a missing one instead of raising
    ``TypeError``.
    """
    compiled = []

    # Main keywords
    for kw in keyword_data.get('keywords') or []:
        canonical = kw['canonical']
        suffixes = kw.get('suffixes') or []
        terms = [canonical] + list(kw.get('aliases') or [])
        compiled += _compile_terms(
            canonical, terms, kw.get('weight', 3),
            # Bind this entry's suffixes now; the lambda is used immediately,
            # but the default argument keeps it independent of the loop variable.
            lambda term, suffixes=suffixes: build_pattern_with_suffixes(term, suffixes),
        )

    # Proper nouns (smoking guns)
    for pn in keyword_data.get('proper_nouns') or []:
        canonical = pn['canonical']
        terms = [canonical] + list(pn.get('aliases') or [])
        compiled += _compile_terms(canonical, terms, pn.get('weight', 5), build_pattern)

    return compiled
def compile_disambiguators(keyword_data: dict) -> list:
    """Compile the false-positive filter patterns.

    Reads the optional ``disambiguators`` list from *keyword_data* and
    returns one compiled, case-insensitive regex per entry, built with
    ``build_pattern``. A missing key and a key that is present but empty
    (parsed as ``None``) both yield an empty list.
    """
    return [
        re.compile(build_pattern(term), re.IGNORECASE | re.UNICODE)
        for term in keyword_data.get('disambiguators') or []
    ]
def is_false_positive(text: str, match_start: int, match_end: int,
                      disambiguators: list, window: int = 50) -> bool:
    """Return True when any disambiguator matches near the given match span.

    The context searched is *text* from ``window`` characters before
    ``match_start`` to ``window`` characters after ``match_end``, clamped
    to the string (e.g. 'Kerim Bey' found close to a 'Kırım' hit).
    """
    lo = max(0, match_start - window)
    hi = min(len(text), match_end + window)
    context = text[lo:hi]
    return any(rx.search(context) for rx in disambiguators)
def extract_snippet(text: str, match_start: int, match_end: int,
                    radius: int = 200) -> str:
    """Extract a clean ±radius snippet around a match.

    Args:
        text: full page text.
        match_start, match_end: span of the match inside *text*.
        radius: number of characters of context kept on each side; the
            slice is clamped to the bounds of *text*.

    Returns:
        The context on a single line: whitespace runs become one space,
        non-printable characters are dropped, and the result is trimmed.
    """
    lo = max(0, match_start - radius)
    hi = min(len(text), match_end + radius)
    # Turn every whitespace run (newlines, tabs, ...) into a single space.
    snip = re.sub(r'\s+', ' ', text[lo:hi])
    # Drop control characters and other non-printables.
    snip = ''.join(c for c in snip if c.isprintable())
    # Removing a control character that sat between two spaces leaves a
    # double space (or a blank at either end), so tidy up once more.
    return re.sub(r' {2,}', ' ', snip).strip()
def co_occurrence_score(text: str, term_a: str, term_b: str,
                        compiled_patterns: dict, window: int = 300) -> int:
    """
    Count the (term_a match, term_b match) pairs whose start offsets lie
    within `window` characters of each other.

    `compiled_patterns` maps a term to its compiled regex. The score is 0
    when either term has no entry in it.
    """
    if not (term_a in compiled_patterns and term_b in compiled_patterns):
        return 0

    starts_a = [hit.start() for hit in compiled_patterns[term_a].finditer(text)]
    starts_b = [hit.start() for hit in compiled_patterns[term_b].finditer(text)]
    return sum(
        1
        for pos_a in starts_a
        for pos_b in starts_b
        if abs(pos_a - pos_b) <= window
    )
if __name__ == '__main__':
    # Smoke test: print the generated pattern for a few names and every
    # place it matches in a sample of OCR-damaged text.
    sample_terms = ['Kırım', 'Bahçesaray', 'Cafer Seydahmet', 'İsmail Gaspıralı']
    ocr_sample = """
OCR çöplüğü:
K1r1m Tatarlari hakkinda bir haber.
Bahcesaray'da bir hadise.
K~r~m Hanl1g1 tarihi.
Cafer Seydamet Bey istanbula geldi.
Ismail Gaspirali'nin 1934 anma toplantisi.
Kerim Bey ile karistirma — bu yanlis pozitif.
"""
    for term in sample_terms:
        pattern = build_pattern(term)
        print(f"\n{term!r}{pattern}")
        matcher = re.compile(pattern, re.IGNORECASE)
        for hit in matcher.finditer(ocr_sample):
            print(f" hit: {hit.group(0)!r} @ {hit.start()}")

View File

@@ -0,0 +1,3 @@
requests>=2.31
PyYAML>=6.0
python-docx>=1.0

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Run the EKOS PDF searcher inside a transient systemd user-unit with
# CPU + memory caps. All extra args are forwarded to 02_search_pdfs.py.
#
# Profile env vars (override before invocation):
#   EKOS_CPU_QUOTA   default 300%   (3 cores)
#   EKOS_MEM_MAX     default 3G
#   EKOS_SWAP_MAX    default 1G
#   EKOS_UNIT        default ekos-search-<timestamp>
#
# Examples:
#   bash scripts/run_capped.sh --slug cumhuriyet --priority-only --year-to 1931 --workers 2
#   EKOS_CPU_QUOTA=500% EKOS_MEM_MAX=4G bash scripts/run_capped.sh --priority-only --workers 4
#
# Monitor:
#   systemctl --user status <unit>
#   journalctl --user -u <unit> -f
#   systemctl --user stop <unit>
set -euo pipefail

# Skill root = parent of the directory this script lives in.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PY="${HERE}/.venv/bin/python"
SCRIPT="${HERE}/scripts/02_search_pdfs.py"

CPU_QUOTA="${EKOS_CPU_QUOTA:-300%}"
MEM_MAX="${EKOS_MEM_MAX:-3G}"
SWAP_MAX="${EKOS_SWAP_MAX:-1G}"
UNIT="${EKOS_UNIT:-ekos-search-$(date +%Y%m%d-%H%M%S)}"

# Fail early with a readable message instead of an exec error or a unit
# that dies right after start.
if ! command -v systemd-run >/dev/null 2>&1; then
    echo "systemd-run not found in PATH; this wrapper needs systemd" >&2
    exit 1
fi

if [[ ! -x "$PY" ]]; then
    echo "venv not found: $PY" >&2
    echo "create with: cd $HERE && python3 -m venv .venv && .venv/bin/pip install -r scripts/requirements.txt" >&2
    exit 1
fi

if [[ ! -f "$SCRIPT" ]]; then
    echo "search script not found: $SCRIPT" >&2
    exit 1
fi

echo "Unit: $UNIT"
echo "CPUQuota: $CPU_QUOTA"
echo "MemoryMax: $MEM_MAX"
echo "Forward: $*"
echo

# exec: this shell is replaced by systemd-run, so its exit status is ours.
exec systemd-run --user \
    --unit="$UNIT" \
    --working-directory="$HERE" \
    -p "CPUQuota=$CPU_QUOTA" \
    -p "MemoryMax=$MEM_MAX" \
    -p "MemorySwapMax=$SWAP_MAX" \
    -p "Nice=10" \
    -p "IOWeight=50" \
    --setenv=PYTHONUNBUFFERED=1 \
    "$PY" "$SCRIPT" "$@"

View File

@@ -0,0 +1,229 @@
---
name: telegram
description: Use when reading, searching, sending, or managing Telegram messages and folders for the user's personal account. Triggers on "Telegram'a mesaj gönder", "şu kanaldan son mesajları getir", "Telegram'da ara", "okunmamış mesajlar", "Telegram klasörlerini güncelle", "yeni kanalları kategorize et", "fetch telegram dialogs", "telegram inbox", "@username'e şunu yaz", or any direct mention of fetch_all/tg_read/tg_send/tg_search/tg_inbox/apply_folders. Also covers the Telethon-based pipeline at /home/salva/Documents/telegram (auth, session, channels.json, assignments.json).
---
# Telegram Operator (Telethon)
## Overview
Read, search, send, manage, and organize the user's Telegram personal account from the command line via Telethon. All scripts share one venv and one `.session` file at `/home/salva/Documents/telegram/`.
```
                    ┌── tg_read.py    (fetch from a chat)
                    ├── tg_send.py    (send text/file, reply, silent)
Telethon client ────┼── tg_search.py  (global / scoped search)
(one .session)      ├── tg_inbox.py   (unread overview, mark-read)
                    └── folder pipeline:
                          fetch_all.py → build_assignments.py → apply_folders.py
```
## Project location
`/home/salva/Documents/telegram/`
| File | Role |
|---|---|
| `api.txt` | api_id / api_hash from my.telegram.org. **Do not commit.** |
| `config.py` | Loads creds → `API_ID`, `API_HASH`, `SESSION_NAME` |
| `telegram_session.session` | Telethon SQLite session. **Do not delete unless re-login needed.** |
| `venv/` | Project venv, activate with `source venv/bin/activate` |
| `requirements.txt` | `telethon>=1.43.1` |
| `tg_utils.py` | Shared helpers: `resolve_chat`, `fmt_msg`, `confirm`, `parse_date` |
| `tg_read.py` | Read messages from a chat |
| `tg_send.py` | Send text and/or file (interactive confirm by default) |
| `tg_search.py` | Search messages globally or in one chat |
| `tg_inbox.py` | Unread overview + mark-as-read (single or bulk) |
| `fetch_all.py` | Snapshot all dialogs + 40 messages each → `data/channels.json` |
| `build_assignments.py` | Static id→folder map → `data/assignments.json` |
| `apply_folders.py` | Push folder layout to Telegram (interactive y/N) |
| `categorize.py` | Library helper used if pipeline grows beyond static dict |
| `data/` | All JSON outputs (channels, assignments, compact, names.tsv, …) |
## Setup
```bash
cd /home/salva/Documents/telegram
source venv/bin/activate   # first time: python3 -m venv venv && venv/bin/pip install -r requirements.txt
```
First run on a new machine: any script will prompt for phone number → SMS code → 2FA password (if set), then writes `telegram_session.session`. Subsequent runs are silent.
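If you see `AuthKeyUnregisteredError` after a long absence, the stored session is no longer valid: delete the `.session` file and re-login. `SessionPasswordNeededError` means the account has 2FA enabled and the login still needs the cloud password — enter it when prompted; if no prompt appears, delete the `.session` file and re-login from scratch.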
## Common chat references
Every script accepts the same `chat` argument forms:
- `"@username"` — public username (channels, bots, users)
- `12345` or `-1001234567890` — numeric id (positive = user, negative = group/channel)
- `"some name"` — case-insensitive substring of the dialog name; errors out if 0 or >1 matches, listing the candidates
- `"me"` / `"self"` — Saved Messages (your own DM-to-self chat)
## Reading
```bash
python tg_read.py "@durov" # last 20 messages
python tg_read.py "Born2beroot" --limit 100
python tg_read.py "@channel" --since 2026-04-01 # since a date
python tg_read.py "@x" --search "CVE" # filter inside the chat
python tg_read.py "@x" --json # machine-readable output
python tg_read.py "@x" --mark-read # also clear the unread badge
```
Output format: `id │ YYYY-MM-DD HH:MM │ sender(20) │ text(200)` — one row per message, `--json` flips to a JSON array with `{id,date,sender_id,text,has_media,reply_to}` per item.
## Sending
```bash
python tg_send.py "@user" "Hello" # interactive [y/N]
python tg_send.py "me" "note to self" --yes # auto-confirm
python tg_send.py "@chan" "Caption" --file report.pdf
python tg_send.py "@x" "" --file img.png --caption "ss" # file-only
python tg_send.py "@x" "Reply" --reply-to 12345
python tg_send.py "@x" "ping" --silent # no notification
python tg_send.py "@x" "<b>bold</b>" --parse html
```
**Send safety policy** — by default, `tg_send.py` prints a preview of the destination + payload and asks `Gönder? [y/N]` before transmitting. Pass `--yes` (or `-y`) to skip the prompt for scripted/automated runs. This matches the convention used by `apply_folders.py`.
Default text parse mode is **markdown**. Use `--parse html` for HTML-style entities (`<b>`, `<i>`, `<a href=…>`), or `--parse none` for plain.
## Searching
```bash
python tg_search.py "ransomware" # global, last 50 hits
python tg_search.py "Putin" --since 2026-04-01 -n 200
python tg_search.py "kitap" --chat "E Kitap PDF" # scoped to one chat
python tg_search.py "report" --chat me
```
Global search uses Telegram's server-side message index. Each hit is prefixed with the chat's title in `[brackets]`. Scoped search (`--chat`) is faster and avoids the per-chat title resolution lookup.
## Inbox / unread management
```bash
python tg_inbox.py # ranked by unread count
python tg_inbox.py --top 20
python tg_inbox.py --include-archived # include archived folder
python tg_inbox.py --mark-read "Born2bero" # clear ONE chat
python tg_inbox.py --mark-all-read # clear EVERY unread (asks y/N)
python tg_inbox.py --mark-all-read --yes # … or skip prompt
```
The bulk `--mark-all-read` is destructive on the unread badge state and irreversible — there is no "mark-as-unread" RPC. The script always confirms unless `--yes`.
## Folder pipeline (≈600 dialogs → 9 folders)
3-stage workflow for organizing dialogs into Telegram client-side folders:
```bash
python fetch_all.py # ~1-3 min, refreshes data/channels.json
python build_assignments.py # warns about ⚠ unassigned ids
# → if warnings: edit build_assignments.py:A, add the new ids, rerun
python apply_folders.py # interactive y/N to push to Telegram
```
### Folder schema (current — titles capped at 12 chars by Telegram)
| Emoji | Title | Scope |
|---|---|---|
| 🛡 | `Güvenlik` | Cybersec, hacking, intel feeds, OSINT, ham radio |
| ☁ | `Logs & Cloud` | Cloud account dumps, ULP/redline logs, cracked services |
| ⚔ | `Rus-Ukrayna` | Russia/Ukraine war channels, both sides + Western trackers |
| 🕌 | `Ortadoğu` | Middle East news (Arabic/Persian/Turkish/English) |
| 🎖 | `Askeri Jeo` | Turkish military, geopolitics, MGK, defense industry |
| 📚 | `E-Kitap` | E-books, audiobooks, manga, KPSS/YKS material |
| 🌐 | `Dil & Kurs` | Russian/Swahili/English language groups, Udemy/PacktPub |
| 📈 | `Finans` | Borsa İstanbul, trading, stock tips, central bank |
| 💬 | `Sosyal` | Twitch, social, hobby groups, anything else |
Full id→folder map: `build_assignments.py:A` (~260 entries). Edit the dict, **never** edit `data/assignments.json` directly — `build_assignments.py` regenerates it.
### New-channel triage (unassigned id heuristic)
When `build_assignments.py` reports `⚠ assignment eksik`, read the channel's name and first messages from `data/channels.json`, then assign by these rules (first match wins):
```
HACK / CVE / exploit / SOC / OSINT / red team / siber → Güvenlik
cloud-free / ulp / redline / cracked / leaked logs / vbv → Logs & Cloud
Ukraine / Russia / Donbas / Kyiv / Москва (war context) → Rus-Ukrayna
Arabic-script (ar/fa) news, Israel/Gaza/Syria/Iran → Ortadoğu
TSK / SİHA / NATO / geopolitics / military doctrine → Askeri Jeo
PDF / kitap / e-book / sesli kitap / manga / KPSS / YKS → E-Kitap
Udemy / Coursera / Russian/Swahili/Arabic/French/IELTS → Dil & Kurs
borsa / hisse / trading / forex / kripto → Finans
twitch / hobby / chat / barahol / banter → Sosyal
```
Edge cases:
- Russia/Ukraine **doctrine** (not war news) → `Askeri Jeo`, not `Rus-Ukrayna`.
- Stock-tip Udemy channels → `Finans`, not `Dil & Kurs`.
- Sesli Kitap / Manga / KPSS folded into `E-Kitap`.
## Telethon API constraints
- `DialogFilter.id` 0 and 1 are reserved (All Chats, etc.); `apply_folders.py` skips them.
- Folder titles capped at **12 characters** by Telegram. Telegram allows up to 30 folders (100 with Premium); current schema uses 9.
- `iter_dialogs(archived=None)` returns both normal and archived; `archived=False` (default in `tg_inbox.py`) returns only normal.
- `iter_messages(entity, search=...)` is server-side full-text; `iter_messages(None, search=...)` is the global search.
- Rate limits: don't run `fetch_all.py` more than ~once per hour for accounts with many dialogs (`FloodWaitError`). For sending in tight loops, sleep ≥1s between messages or be ready to handle `FloodWait`.
- `client.send_read_acknowledge(entity)` clears unread; there is no inverse RPC to mark unread.
## Auth & secrets
- `api.txt` and `telegram_session.session` are **as good as a password**: anyone with both can read all your messages and send as you. Keep them out of git, dotfiles sync, and shared backups.
- The MTProto session is bound to the device fingerprint Telethon presents. Telegram → Settings → Devices lists active sessions; revoke "TelegramTUI / linux" entries you don't recognize.
- 2FA password (cloud password) is **not** stored in `.session`; you'll be prompted on first login if it's set.
## When NOT to run
- `apply_folders.py` overwrites each folder's `include_peers` — manual folder rearrangements in the Telegram client are lost. Always confirm before pushing.
- `tg_send.py` and `tg_inbox.py --mark-all-read` are destructive in the "user-visible-side-effect" sense; default behavior is interactive confirm. Don't `--yes` blindly in a script unless the destination/payload is hard-coded and reviewed.
- `fetch_all.py` more than ~hourly: triggers `FloodWaitError` for large accounts.
## Snippets cookbook
```python
# One-off custom run inside the same venv:
import asyncio
from telethon import TelegramClient
from config import API_ID, API_HASH, SESSION_NAME
from tg_utils import resolve_chat, fmt_msg
async def main():
async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as c:
e = await resolve_chat(c, "@durov")
async for m in c.iter_messages(e, limit=5):
print(fmt_msg(m))
asyncio.run(main())
```
```python
# Live monitoring (event handler):
from telethon import events
@client.on(events.NewMessage(chats=["@channel1", "@channel2"]))
async def handler(event):
print(event.chat.title, event.message.text)
client.run_until_disconnected()
```
```python
# Forward N messages from A to B:
msgs = await client.get_messages(src_entity, ids=[101, 102, 103])
await client.forward_messages(dst_entity, msgs)
```
```python
# Download all media from a chat into ./media/:
async for msg in client.iter_messages(entity, limit=100):
if msg.media:
await msg.download_media(file="media/")
```
## Related skills
- `obsidian-tasks` — track Telegram-organization items as tasks.
- `news-crawler`, `freshrss`, `freshrss-reader` — alternative news ingestion paths; `Askeri Jeo`/`Ortadoğu` Telegram channels overlap with FreshRSS feeds.
- `obsidian-linux` — once messages are extracted, can convert into vault notes via `notesmd-cli`.

View File

@@ -0,0 +1,98 @@
"""
STEP 3 — create/update on Telegram the folders listed in data/assignments.json.
assignments.json format:
{
  "folders": [{"title": "...", "emoticon": "🛡"}, ...],
  "assignments": {"<channel_id>": "FolderTitle", ...}
}
"""
import asyncio
import json
from collections import defaultdict
from pathlib import Path
from telethon import TelegramClient
from telethon.tl.functions.messages import (
GetDialogFiltersRequest,
UpdateDialogFilterRequest,
)
from telethon.tl.types import DialogFilter, TextWithEntities
from config import API_HASH, API_ID, SESSION_NAME
DATA_FILE = Path(__file__).parent / "data" / "assignments.json"
def _title_text(f) -> str:
    """Return the title of filter *f* as a plain string.

    Yields "" when *f* has no ``title`` attribute or it is None, the
    title's ``.text`` when the title object has one, and ``str(title)``
    otherwise.
    """
    title = getattr(f, "title", None)
    if title is None:
        return ""
    try:
        return title.text
    except AttributeError:
        return str(title)
async def main() -> None:
    """Preview the folder layout from assignments.json, confirm, then push it to Telegram.

    Steps:
      1. Read DATA_FILE and group the assigned chat ids by folder title.
      2. Print a per-folder preview and ask for confirmation on stdin;
         anything other than "y" aborts without contacting Telegram.
      3. Fetch the existing dialog filters. A folder whose title already
         exists keeps its id and is overwritten; any other folder gets a
         fresh id.
      4. Send one UpdateDialogFilterRequest per folder.

    Chat ids that cannot be resolved to an input entity are reported and
    left out. Folders that end up with no peers are skipped.
    """
    cfg = json.loads(DATA_FILE.read_text(encoding="utf-8"))
    folders_meta = cfg["folders"]                      # ordered, carries the emoticon
    assignments: dict[str, str] = cfg["assignments"]   # "id" -> folder title

    buckets: dict[str, list[int]] = defaultdict(list)
    for sid, title in assignments.items():
        buckets[title].append(int(sid))

    print("Önizleme:")
    for f in folders_meta:
        n = len(buckets.get(f["title"], []))
        print(f" {f['emoticon']} {f['title']:<22} {n:>3} sohbet")
    if input("\nTelegram'a uygula? [y/N]: ").strip().lower() != "y":
        print("iptal.")
        return

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        resp = await client(GetDialogFiltersRequest())
        existing = resp.filters if hasattr(resp, "filters") else resp

        by_title: dict[str, DialogFilter] = {}
        used_ids: set[int] = {0, 1}
        for f in existing:
            # Reserve the id of every existing filter that has one, not only
            # plain DialogFilter objects; otherwise a new folder could be
            # given the id of another filter type and overwrite it.
            existing_id = getattr(f, "id", None)
            if existing_id is not None:
                used_ids.add(existing_id)
            if isinstance(f, DialogFilter):
                by_title[_title_text(f)] = f
        next_id = max(used_ids) + 1

        for fmeta in folders_meta:
            title = fmeta["title"]
            ids = buckets.get(title, [])
            include_peers = []
            for cid in ids:
                try:
                    include_peers.append(await client.get_input_entity(cid))
                except Exception as e:
                    # Best effort: one unresolvable chat must not abort the run.
                    print(f" ! {cid} eklenemedi: {e}")

            if not include_peers:
                # A filter with no peers and no category flags is rejected by
                # the server (FILTER_INCLUDE_EMPTY); skip it rather than
                # failing halfway through the list.
                print(f"{fmeta['emoticon']} {title} (0) — atlandı (boş klasör)")
                continue

            if title in by_title:
                fid = by_title[title].id
                action = "güncellendi"
            else:
                fid = next_id
                next_id += 1
                action = "oluşturuldu"

            df = DialogFilter(
                id=fid,
                title=TextWithEntities(text=title, entities=[]),
                pinned_peers=[],
                include_peers=include_peers,
                exclude_peers=[],
                contacts=False, non_contacts=False, groups=False,
                broadcasts=False, bots=False,
                exclude_muted=False, exclude_read=False, exclude_archived=False,
                emoticon=fmeta.get("emoticon"),
            )
            await client(UpdateDialogFilterRequest(id=fid, filter=df))
            print(f"{fmeta['emoticon']} {title} ({len(include_peers)}) — {action}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,296 @@
"""Claude'un elle yaptığı kategorizasyon → data/assignments.json."""
import json
from pathlib import Path
# Telegram klasör ismi en fazla 12 karakter.
FOLDERS = [
{"title": "Güvenlik", "emoticon": "🛡"},
{"title": "Logs & Cloud", "emoticon": ""},
{"title": "Rus-Ukrayna", "emoticon": ""},
{"title": "Ortadoğu", "emoticon": "🕌"},
{"title": "Askeri Jeo", "emoticon": "🎖"},
{"title": "E-Kitap", "emoticon": "📚"}, # Sesli Manga + KPSS da burada
{"title": "Dil & Kurs", "emoticon": "🌐"},
{"title": "Finans", "emoticon": "📈"}, # yeni: borsa/trading
{"title": "Sosyal", "emoticon": "💬"}, # kalan sosyal/eğlence
]
# id → folder title
A = {
# --- Siber Güvenlik ---
-1003772746107: "Güvenlik", # Born2beroot
-1001182095274: "Güvenlik", # Cyber Threat Intelligence Feeds
-1001601457644: "Güvenlik", # Linux Türkiye Topluluğu
-5245874036: "Güvenlik", # APT10
-1001424015690: "Güvenlik", # Siber Kulüpler Birliği
-1002044403490: "Güvenlik", # Siber Güvenlik Turkey
-1001433765532: "Güvenlik", # Türkiye Amatör Telsiz
-1001448773154: "Güvenlik", # tinyGS Community
-1001248961775: "Güvenlik", # Geek Hacker
-1001820205147: "Güvenlik", # PSD
-1001486620605: "Güvenlik", # OpenStreetMap Türkiye
-1001224374951: "Güvenlik", # Siberdinc
-1001175709038: "Güvenlik", # Özgür Yazılım Derneği
-1001102366261: "Güvenlik", # Dark Web Intelligence
-1001705864902: "Güvenlik", # HackCodeRepeat
-4895889925: "Güvenlik", # Born2beroot (küçük)
-1001560793071: "Güvenlik", # ForenSec
-1001099338447: "Güvenlik", # burpsuite (unofficial)
-1002019701877: "Güvenlik", # sc_sibermagazin
-1001968311017: "Güvenlik", # CVE
-4762809106: "Güvenlik", # RaCONF'25
-4960718009: "Güvenlik", # OpZ
-1001425186624: "Güvenlik", # Zer0Day Lab
-1001369540037: "Güvenlik", # inj3ct0r exploit db
-1002601559408: "Güvenlik", # Garuda Error System
-1002389372004: "Güvenlik", # AnonSec (hacker crew)
# --- Logs & Cloud ---
-1002696769378: "Logs & Cloud", # Valide Cloud Free
-1001628710143: "Logs & Cloud", # Omega Cloud
-1001921972180: "Logs & Cloud", # Burn Cloud
-1001939548708: "Logs & Cloud", # Trident Cloud
-1001602298018: "Logs & Cloud", # Free xbox game pass
-1002231096661: "Logs & Cloud", # Vpesports Xbox
-1002592627432: "Logs & Cloud", # Cvv190 Cloud
-1002107853176: "Logs & Cloud", # Plutonium logs
-1002047552897: "Logs & Cloud", # Бесплатный лицензионный
-1002575521311: "Logs & Cloud", # Darknes-Cloud
-1002355411584: "Logs & Cloud", # Roves Cloud
-1002025418650: "Logs & Cloud", # Valide Cloud FREE
-1002849195507: "Logs & Cloud", # azef cloud
-1001440229722: "Logs & Cloud", # Freedom F0x
-1002294768789: "Logs & Cloud", # D49d3k ULP-Cloud
-1002415889954: "Logs & Cloud", # scale invite
-1001773319933: "Logs & Cloud", # CRYPTOLOGS REDLINE
-1001578557816: "Logs & Cloud", # Link Arşivleri
-1001672949739: "Logs & Cloud", # BerserkLogs
# --- Rus-Ukrayna ---
-1001668977160: "Rus-Ukrayna", # Rybar in English
-1001326223284: "Rus-Ukrayna", # Рыбарь
-1001082968817: "Rus-Ukrayna", # Минобороны России
-1001513431778: "Rus-Ukrayna", # Два майора
-1001475819126: "Rus-Ukrayna", # Роскосмос
-1001220606936: "Rus-Ukrayna", # STERNENKO
-1001783035076: "Rus-Ukrayna", # TrackANaziMerc
-1001003313758: "Rus-Ukrayna", # Новости Москвы
-1001654562332: "Rus-Ukrayna", # TASS
-1001386375324: "Rus-Ukrayna", # МВС України
-1001173684180: "Rus-Ukrayna", # ЧП / Крым
-1001310984791: "Rus-Ukrayna", # Intel Slava
-1001747148099: "Rus-Ukrayna", # Судоплатов
-1002121256650: "Rus-Ukrayna", # Угруповання об'єднаних сил
-1001583313036: "Rus-Ukrayna", # АРХАНГЕЛ СПЕЦНАЗА
-1001352726486: "Rus-Ukrayna", # INSIDER UA
-1001669110938: "Rus-Ukrayna", # UNITED24Media
-1001350274993: "Rus-Ukrayna", # Tim Kirby Russia Hardcore
-1001463721328: "Rus-Ukrayna", # Zelenskiy Official
-1001117303064: "Rus-Ukrayna", # Россия в глобальной политике
-1001509172593: "Rus-Ukrayna", # monitorwar
-1001222633586: "Rus-Ukrayna", # FEDOROV
-1001469021333: "Rus-Ukrayna", # DeepState
-1001616052141: "Rus-Ukrayna", # Проект «Хочу жить»
-1001900958834: "Rus-Ukrayna", # Ігор Клименко МВС
-1001617325371: "Rus-Ukrayna", # Десантно-штурмові війська ЗСУ
-1002490955621: "Rus-Ukrayna", # DIPLOMATIE RUSSE
-1001385909762: "Rus-Ukrayna", # Артем Дмитрук
-1001764041965: "Rus-Ukrayna", # Kremlin News EN
-1001790907266: "Rus-Ukrayna", # Кремль Новости RU
-1003222724492: "Rus-Ukrayna", # Ionfall
-1001936622736: "Rus-Ukrayna", # ЖАХ З НЕБЕС 123
-1002029042694: "Rus-Ukrayna", # 123 омсбр
-1002051535105: "Rus-Ukrayna", # 114 Бригада
# --- Ortadoğu ---
-1002062736232: "Ortadoğu", # نايا - NAYA
-1002059959435: "Ortadoğu", # UAE MoD (multilingual AR)
-1001272529767: "Ortadoğu", # Middle East News
-1001822461311: "Ortadoğu", # JHArnous
-1002263475135: "Ortadoğu", # Syrian FM
-1001226363458: "Ortadoğu", # Stay Free
-1001048133085: "Ortadoğu", # تَأكّدْ
-1001081687249: "Ortadoğu", # مركز الزيتونة AR
-1001147346052: "Ortadoğu", # Al-Zaytouna EN
-1002142228056: "Ortadoğu", # Elly_bar Israel-Hamas
-1001797479924: "Ortadoğu", # بيان نيوز
-1001180533415: "Ortadoğu", # Orient - أورينت
-1001463836083: "Ortadoğu", # Suriye Milli Ordusu
-1002280669663: "Ortadoğu", # خیابون انقلاب
-1002450267230: "Ortadoğu", # خیابون انقلاب (dup)
# --- Askeri & Jeopolitik ---
-1001173129471: "Askeri Jeo", # AZERTAC
-1002143761332: "Askeri Jeo", # Askeri İstihbarat Sohbet
-1001508782705: "Askeri Jeo", # 3. Dünya Savaşı
-1001802903419: "Askeri Jeo", # Askeri İstihbarat TR
-1001220118870: "Askeri Jeo", # Enformasyon
-1001251299061: "Askeri Jeo", # SouthFront
-1001699619673: "Askeri Jeo", # The Grayzone
-1001689501969: "Askeri Jeo", # Fokus+
-1001734228215: "Askeri Jeo", # People's Daily China
-1001810182217: "Askeri Jeo", # Rerum Novarum
-1002642181270: "Askeri Jeo", # Gallipoli General
-1001834311682: "Askeri Jeo", # SOFTAÇAM
-1001857092414: "Askeri Jeo", # FahrettinAltay_
-1002334106447: "Askeri Jeo", # Source News
-1001601338144: "Askeri Jeo", # ASKERİ HARP
-1002388640996: "Askeri Jeo", # Military Vibe
-1001055365200: "Askeri Jeo", # Nairobi News
-1001127820109: "Askeri Jeo", # Bellingcat
-990795574: "Askeri Jeo", # Milli Güvenlik Kurulu
-1001381692248: "Askeri Jeo", # Rusya Ankara Büyükelçiliği
# --- E-Kitap ---
-1001968002316: "E-Kitap", # E Kütüphanem
-1001295770478: "E-Kitap", # Kitap Turşusu Premium
-1001273763604: "E-Kitap", # Kitap Evreni
-1001176839029: "E-Kitap", # e-kitap yardımlaşma
-1003179138041: "E-Kitap", # Kitap Botu PDF
-1001948357383: "E-Kitap", # PDF E Kitap İstek
-1001267622915: "E-Kitap", # E Kitap Grup
-1003339908160: "E-Kitap", # Kitap Arama Grubu
-1001884485811: "E-Kitap", # Kitaplık Rafı
-1001219338945: "E-Kitap", # e-Babil Kütüphanesi
-1002761890261: "E-Kitap", # E kitap Roman PDF
-1001379065337: "E-Kitap", # Dijital Kitap
-1001436274859: "E-Kitap", # E-Kitap Oku
-1002231474242: "E-Kitap", # E - Kitap PDF
-1001837236620: "E-Kitap", # E-Kitap Paylaşım Sohbet
-1001896451121: "E-Kitap", # Kitap Modu
-1002123805391: "E-Kitap", # Kitap PDF Arşivi Roman Hikaye
-1001651874667: "E-Kitap", # E Kitap PDF
-1001869548408: "E-Kitap", # Aranan Kitapçık duyuru
-1001741842267: "E-Kitap", # PDF Kitap Evreni
-1002844665098: "E-Kitap", # PDF KİTAP
-1001379762150: "E-Kitap", # Atatürk Pdf Kitap
-1001380972711: "E-Kitap", # BUNDLE Kitap epub pdf
-1002677555843: "E-Kitap", # Büyük Kitap Arşivi
-1002084828902: "E-Kitap", # Kitapçı PDF Arşivi
-1002502833110: "E-Kitap", # PDF KİTAP ROMAN HİKAYE
-1002969079664: "E-Kitap", # PDF KİTAP ARŞİV
-1003491537567: "E-Kitap", # YATIRIM KİTAPLARI
-1001625595378: "E-Kitap", # Kütübhâne-i Tevârîh
-1001616159980: "E-Kitap", # Books
-1001916886683: "E-Kitap", # PDF Kitap İndir
-1002066019978: "E-Kitap", # PDF Kitap Yurdu
-1002233958112: "E-Kitap", # PDF Kitaplar pdfstok
-1002739085389: "E-Kitap", # Telegram Kitap Grupları
-1003450748883: "E-Kitap", # KÜTÜPHANE
# --- Sesli & Manga ---
-1001651817526: "E-Kitap" , # RiF Новеллы, ранобэ и фф
-1001559096136: "E-Kitap" , # Sesli Kitap
-1002267174397: "E-Kitap" , # Aaron Arşiv
-1001851524017: "E-Kitap" , # Hentai TV
-1003037921710: "E-Kitap" , # Sesli Kitap Storytel
-1003026138059: "E-Kitap" , # Sesli Kitap Dinlio
-1003106544769: "E-Kitap" , # SESLİ KİTAP EDEBİYAT
-1003483217842: "E-Kitap" , # MANGA KİTAPLARI
-1002269816836: "E-Kitap" , # Anime Maniaxx
-1003179794694: "E-Kitap" , # Dergi PDF Arşivi
-1001519763115: "E-Kitap" , # Sesli Kitap Dinle
-1003417275151: "E-Kitap" , # ÇİZGİ ROMAN KİTAPLARI
# --- KPSS & YKS ---
-1002788272998: "E-Kitap" , # AGS KPSS PDF
-1002967115062: "E-Kitap" , # YDS YÖKDİL
-1002335523660: "E-Kitap" , # KPSS YKS KİTAP PDF
-1002164684267: "E-Kitap" , # Yks PDF AYT TYT
-1003029282639: "E-Kitap" , # KİTAP PDF YKS KPSS
-1003332920930: "E-Kitap" , # SINAV KAYNAKLARI
# --- Dil & Kurs ---
-1001279165634: "Dil & Kurs", # Udemy Courses Free
-1001498152897: "Dil & Kurs", # Eduonix Courses Free
-1001005463014: "Dil & Kurs", # PacktPub Free Learning
-1001044241441: "Dil & Kurs", # Books Mania (grammar)
-1002973548671: "Dil & Kurs", # RUSÇA ÖĞREN SOHBET
-1001541869122: "Dil & Kurs", # I speak russian
-1001205656183: "Dil & Kurs", # Russian Microlearning
-1001262177780: "Dil & Kurs", # Russian With Max
-1002374924223: "Dil & Kurs", # LLama Russian Study
-1001475363663: "Dil & Kurs", # LEARN SWAHILI
-1001654101128: "Dil & Kurs", # Russian for lunch
-1001912229645: "Dil & Kurs", # Russian home
-1001933331449: "Dil & Kurs", # Study Russian
-1002647054427: "Dil & Kurs", # Tutorial for new joiners
-1003023929968: "Dil & Kurs", # RUSÇA ÖĞRENİYORUM 2025
-1001159423770: "Dil & Kurs", # English Books Magazines Novels
-4509421355: "Dil & Kurs", # Russian LLama
-1001612802963: "Dil & Kurs", # Vitabu vya Kiislamu (Swahili)
-1001807530830: "Dil & Kurs", # Ankara Rus Evi
# --- Sosyal & Diğer ---
-1001379307100: "Sosyal" , # Ламповая беседка
-1001887551302: "Sosyal" , # ТРОЕТОЧИЕ
-1001492338580: "Sosyal" , # Sirius poets
-1001751338081: "Sosyal" , # Geometric Telegramssion
-1001760743689: "Sosyal" , # Квартал красных фонарей
-1001714372021: "Sosyal" , # Kaktüs v2.0
-1002464236122: "Sosyal" , # Malvinkin Twitch
-1001865528673: "Sosyal" , # Fiftnmls
-1002466936546: "Finans" , # İnfo Yatırım Hisse
-1001961199646: "Sosyal" , # аничух (twitch)
-1003321701261: "Finans" , # ADRENALİN TRADE
-1001613153861: "Sosyal" , # FOŞİX ERLİK
-1001495437712: "Sosyal" , # Erlik Video Deposu
-1001363595671: "Finans" , # Advo
-1001874359773: "Finans" , # Udemy Türkçe (stock tips)
-1001591294939: "Finans" , # Hazine-i BORSA
-4789484210: "Finans" , # Trade
-1002517056894: "Sosyal" , # Барахолка Москва
-1003222134628: "Sosyal" , # GAME CHILL
-1001476005114: "Finans" , # cBank
-901188134: "Sosyal" , # CHP GENÇLİK
-4543354861: "Sosyal" , # Atatürkçüler Birliği
-693237968: "Sosyal" , # İzmir Kavram
-525645675: "Sosyal" , # GB - Jeoloji
-567627579: "Sosyal" , # GB - Psikoloji
-562687596: "Sosyal" , # GB - tarih
-537589148: "Sosyal" , # GB - tıp
-541262586: "Sosyal" , # GB - kimya
-516377683: "Sosyal" , # GB - biyoloji
-500211559: "Sosyal" , # GB - mühendislik
}
def main() -> None:
    """Validate the static id → folder map against channels.json and write assignments.json.

    Reads data/channels.json next to this file, then reports:
      * channel ids that have no entry in ``A``;
      * ids in ``A`` that are not in channels.json;
      * folder titles used in ``A`` that are not declared in ``FOLDERS``.
    Finally writes ``{"folders": FOLDERS, "assignments": {...}}`` (ids as
    strings) to data/assignments.json and prints a per-folder count.
    """
    from collections import Counter  # only needed here; keeps the file's import block untouched

    here = Path(__file__).parent
    channels = json.loads((here / "data" / "channels.json").read_text(encoding="utf-8"))

    # One pass builds the id → name lookup; the first occurrence of an id wins.
    names: dict = {}
    for c in channels:
        names.setdefault(c["id"], c.get("name", "?"))

    channel_ids = set(names)
    assigned_ids = set(A)
    missing = channel_ids - assigned_ids
    extra = assigned_ids - channel_ids

    if missing:
        print("⚠ assignment eksik:")
        for mid in sorted(missing):  # sorted: stable output between runs
            print(f" {mid} {names[mid]!r}")
    if extra:
        print("⚠ assignment'da fazladan ID var:", sorted(extra))

    # A title that is not declared in FOLDERS never becomes a folder, so a
    # typo would silently drop those chats; report it here.
    known_titles = {f["title"] for f in FOLDERS}
    unknown_titles = sorted({t for t in A.values() if t not in known_titles})
    if unknown_titles:
        print("⚠ FOLDERS içinde olmayan klasör adı:", unknown_titles)

    assignments_str = {str(k): v for k, v in A.items()}
    counts = Counter(A.values())

    out = here / "data" / "assignments.json"
    out.write_text(
        json.dumps(
            {"folders": FOLDERS, "assignments": assignments_str},
            ensure_ascii=False, indent=2,
        ),
        encoding="utf-8",
    )
    print(f"\n{len(A)} atama → {out}")
    for f in FOLDERS:
        n = counts.get(f["title"], 0)
        print(f" {f['emoticon']} {f['title']:<22} {n:>3}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,35 @@
"""
ADIM 2 — Sınıflandırma sonucunu (ID → klasör) data/assignments.json'dan okur.
assignments.json formatı:
{
"folders": ["Klasör1", "Klasör2", ...], # tam 10 tane
"assignments": { "<channel_id>": "Klasör1", ... }
}
Bu dosyayı Claude (ben) data/channels.json'ı analiz edip üretir.
"""
from __future__ import annotations
import json
from pathlib import Path
# Location of the classification result, relative to this module.
_ASSIGN_FILE = Path(__file__).parent / "data" / "assignments.json"
# Parsed file contents; populated by the first successful _load() call.
_cache: dict | None = None


def _load() -> dict:
    """Return the parsed assignments file, reading it from disk at most once.

    Raises:
        FileNotFoundError: if ``data/assignments.json`` does not exist yet.
    """
    global _cache
    if _cache is not None:
        return _cache
    if not _ASSIGN_FILE.exists():
        raise FileNotFoundError(
            f"{_ASSIGN_FILE} yok. Önce data/channels.json üretilmeli, "
            "sonra Claude assignments.json'u yazacak."
        )
    raw = _ASSIGN_FILE.read_text(encoding="utf-8")
    _cache = json.loads(raw)
    return _cache
def categorize(channel: dict) -> str | None:
    """Return the folder assigned to *channel*, or ``None`` if it has none.

    The lookup key is the channel's ``"id"`` converted to a string, because
    the ``"assignments"`` mapping is keyed by stringified ids.
    """
    table = _load()["assignments"]
    lookup_key = str(channel["id"])
    return table.get(lookup_key)

View File

@@ -0,0 +1,9 @@
import re
from pathlib import Path
# Credentials live in api.txt beside this module.
_API_TXT = Path(__file__).parent / "api.txt"
_text = _API_TXT.read_text(encoding="utf-8")


def _extract(pattern: str, label: str) -> str:
    """Return capture group 1 of *pattern* in api.txt.

    Raises:
        ValueError: if the pattern does not match, naming the missing field
            instead of failing with an opaque AttributeError on ``None``.
    """
    found = re.search(pattern, _text)
    if found is None:
        raise ValueError(f"{label} not found in {_API_TXT}")
    return found.group(1)


# Each label may be followed by its value on the same line or on the next one.
API_ID = int(_extract(r"api_id:\s*\n?\s*(\d+)", "api_id"))
API_HASH = _extract(r"api_hash:\s*\n?\s*([a-f0-9]+)", "api_hash")
# Session file path (without extension) stored next to this module.
SESSION_NAME = str(Path(__file__).parent / "telegram_session")

View File

@@ -0,0 +1,63 @@
"""
ADIM 1 — Her grup/kanal + son mesajları çek, data/channels.json'a kaydet.
Arşivli olanlar da dahil (iter_dialogs(archived=None) hem normal hem arşivli getirir).
"""
import asyncio
import json
from pathlib import Path
from telethon import TelegramClient
from config import API_ID, API_HASH, SESSION_NAME
DATA_DIR = Path(__file__).parent / "data"
OUTPUT = DATA_DIR / "channels.json"
MESSAGE_SAMPLE = 40 # her kanaldan kaç mesaj
MESSAGE_CHAR_LIMIT = 600 # her mesaj max uzunluk
async def main() -> None:
    """Dump every group/channel dialog with a sample of recent messages.

    Iterates all dialogs (``archived=None``), skips anything that is neither a
    group nor a channel, collects up to ``MESSAGE_SAMPLE`` non-empty message
    texts per dialog (each cut to ``MESSAGE_CHAR_LIMIT`` characters) and
    writes the collected records to ``OUTPUT`` as JSON.
    """
    DATA_DIR.mkdir(exist_ok=True)

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        me = await client.get_me()
        print(f"Bağlandı: @{me.username or me.first_name}\n")

        collected: list[dict] = []
        async for dialog in client.iter_dialogs(archived=None):
            if not dialog.is_group and not dialog.is_channel:
                continue

            position = len(collected) + 1
            print(f"[{position:>3}] {dialog.name} (arşiv={bool(dialog.archived)})")

            sample: list[str] = []
            try:
                async for message in client.iter_messages(dialog.entity, limit=MESSAGE_SAMPLE):
                    body = (message.message or "").strip()
                    if not body:
                        continue
                    sample.append(body[:MESSAGE_CHAR_LIMIT])
            except Exception as exc:
                # Best effort: a dialog whose history cannot be read is still
                # recorded, with whatever messages were gathered so far.
                print(f" ! mesaj çekilemedi: {exc}")

            if dialog.is_channel and not dialog.is_group:
                dialog_type = "channel"
            else:
                dialog_type = "group"

            record = {
                "id": dialog.id,
                "name": dialog.name or "",
                "type": dialog_type,
                "is_broadcast": bool(getattr(dialog.entity, "broadcast", False)),
                "archived": bool(dialog.archived),
                "unread_count": dialog.unread_count,
                "messages": sample,
            }
            collected.append(record)

        serialized = json.dumps(collected, ensure_ascii=False, indent=2)
        OUTPUT.write_text(serialized, encoding="utf-8")
        print(f"\n{len(collected)} sohbet kaydedildi -> {OUTPUT}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1 @@
telethon>=1.43.1

View File

@@ -0,0 +1,77 @@
"""Show unread chats — your real inbox view.
Usage:
python tg_inbox.py # all unread, sorted by count desc
python tg_inbox.py --top 20
python tg_inbox.py --include-archived
python tg_inbox.py --mark-read "Born2bero" # zero-out a specific chat
python tg_inbox.py --mark-all-read --yes # nuke ALL unread (destructive)
"""
from __future__ import annotations
import argparse
import asyncio
from telethon import TelegramClient
from config import API_HASH, API_ID, SESSION_NAME
from tg_utils import confirm, resolve_chat
async def main() -> None:
    """List unread dialogs and optionally mark them as read.

    Modes, selected by command-line flags:
      * ``--mark-read CHAT``: mark one chat as read and exit.
      * default: print every dialog with unread messages, highest unread
        count first (``--top N`` keeps only the first N rows).
      * ``--mark-all-read``: after listing, mark every *listed* chat as read;
        asks for confirmation unless ``--yes`` is given.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("--top", type=int, default=0, help="show only top N (default: all)")
    p.add_argument("--include-archived", action="store_true",
                   help="include archived dialogs (default: only normal)")
    p.add_argument("--mark-read", help="mark this specific chat as read")
    p.add_argument("--mark-all-read", action="store_true",
                   help="mark every unread chat as read (DESTRUCTIVE)")
    p.add_argument("--yes", "-y", action="store_true", help="skip confirmation")
    args = p.parse_args()

    # None = normal and archived dialogs, False = normal dialogs only.
    archived = None if args.include_archived else False

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        # Single-target mark-read
        if args.mark_read:
            entity = await resolve_chat(client, args.mark_read)
            await client.send_read_acknowledge(entity)
            title = getattr(entity, "title", None) or getattr(entity, "username", None) or str(entity.id)
            print(f"{title} marked as read")
            return

        # Collect unread rows as (unread_count, name, dialog_id, kind).
        unread: list[tuple[int, str, int, str]] = []
        async for d in client.iter_dialogs(archived=archived):
            if d.unread_count > 0:
                kind = "channel" if (d.is_channel and not d.is_group) else (
                    "group" if d.is_group else "user")
                unread.append((d.unread_count, d.name or str(d.id), d.id, kind))

        # Tuples sort by unread count first, so this puts the noisiest chats on top.
        unread.sort(reverse=True)
        if args.top:
            unread = unread[:args.top]

        total = sum(n for n, *_ in unread)
        print(f"# {len(unread)} unread chats — {total} unread messages\n")
        for n, name, cid, kind in unread:
            print(f" {n:>5} [{kind:<7}] {name} (id={cid})")

        # Mark-all-read (only the rows listed above, i.e. after --top).
        if args.mark_all_read:
            print()
            if not args.yes and not confirm(f"Mark ALL {len(unread)} chats as read?"):
                print("iptal.")
                return
            for _, name, cid, _ in unread:
                try:
                    await client.send_read_acknowledge(await client.get_input_entity(cid))
                    print(f"{name}")
                except Exception as e:
                    # Keep going: one failing chat must not abort the whole pass.
                    print(f" ! {name}: {e}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,71 @@
"""Read messages from a Telegram chat.
Usage:
python tg_read.py "@username"
python tg_read.py "Born2beroot" --limit 50
python tg_read.py -1001182095274 --since 2026-04-01
python tg_read.py "@durov" --limit 5 --json
"""
from __future__ import annotations
import argparse
import asyncio
import json
from telethon import TelegramClient
from config import API_HASH, API_ID, SESSION_NAME
from tg_utils import fmt_msg, parse_date, resolve_chat
async def main() -> None:
    """Fetch messages from one chat and print them as lines or as JSON.

    ``--since`` switches to oldest-first iteration starting at the given date,
    ``--search`` filters by text, ``--json`` collects the messages into a JSON
    array instead of printing one formatted line each, and ``--mark-read``
    acknowledges the chat after the messages were fetched.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("chat", help="@username, numeric id, or name substring")
    parser.add_argument("--limit", "-n", type=int, default=20, help="max messages (default 20)")
    parser.add_argument("--since", help="YYYY-MM-DD; only newer than this date")
    parser.add_argument("--search", "-s", help="filter to messages containing this text")
    parser.add_argument("--json", action="store_true", help="emit JSON instead of table")
    parser.add_argument("--mark-read", action="store_true", help="mark fetched messages as read")
    args = parser.parse_args()

    # Parse the date before connecting so a bad value fails fast.
    since = None
    if args.since:
        since = parse_date(args.since)

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        entity = await resolve_chat(client, args.chat)

        title = getattr(entity, "title", None)
        if not title:
            title = getattr(entity, "username", None)
        if not title:
            title = str(entity.id)

        as_json = args.json
        if not as_json:
            print(f"# {title} (id={entity.id})\n")

        fetch_opts = {"limit": args.limit}
        if since:
            fetch_opts.update(reverse=True, offset_date=since)
        if args.search:
            fetch_opts["search"] = args.search

        records = []
        async for message in client.iter_messages(entity, **fetch_opts):
            if not as_json:
                print(fmt_msg(message))
                continue
            records.append({
                "id": message.id,
                "date": message.date.isoformat(),
                "sender_id": message.sender_id,
                "text": message.message or "",
                "has_media": message.media is not None,
                "reply_to": message.reply_to_msg_id,
            })

        if as_json:
            print(json.dumps(records, ensure_ascii=False, indent=2))

        if args.mark_read:
            await client.send_read_acknowledge(entity)
            if not as_json:
                print(f"\n{title} marked as read")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,72 @@
"""Search messages — globally or scoped to one chat.
Usage:
python tg_search.py "CVE-2024" # global, last 50 hits
python tg_search.py "kitap" --chat "E Kitap" # scoped to one chat
python tg_search.py "Putin" --since 2026-04-01 --limit 100
python tg_search.py "report" --chat me # only Saved Messages
Global search uses Telegram's server-side message index (telethon
client.iter_messages(None, search=...)).
"""
from __future__ import annotations
import argparse
import asyncio
from telethon import TelegramClient
from config import API_HASH, API_ID, SESSION_NAME
from tg_utils import fmt_msg, parse_date, resolve_chat
async def main() -> None:
    """Search messages globally or inside one chat and print the hits.

    Without ``--chat`` the search is global and every hit is prefixed with the
    title of the chat it came from. ``--chat me`` / ``--chat self`` targets
    Saved Messages. ``--since`` restricts hits to messages at or after the
    given date.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("query", help="search text")
    p.add_argument("--chat", help="restrict to this chat (@user/id/name/me)")
    p.add_argument("--limit", "-n", type=int, default=50)
    p.add_argument("--since", help="YYYY-MM-DD lower bound")
    args = p.parse_args()

    since = parse_date(args.since) if args.since else None

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        if not args.chat:
            entity = None
        elif args.chat.strip().lower() in {"me", "self"}:
            # Resolve Saved Messages directly; sending "me" through the
            # name-substring matcher could hit any dialog containing "me".
            entity = await client.get_me()
        else:
            entity = await resolve_chat(client, args.chat)

        scoped = entity is not None
        kwargs = {"search": args.query, "limit": args.limit}
        if since is not None and scoped:
            kwargs["reverse"] = True
            kwargs["offset_date"] = since
        elif since is not None:
            # Telethon rejects reverse=True for a global search, so the date
            # bound and the hit limit are enforced in the loop below instead.
            kwargs["limit"] = None

        # Cache chat titles to annotate global hits.
        chat_titles: dict[int, str] = {}

        async def title_for(chat_id: int) -> str:
            """Return a display title for *chat_id*, falling back to the id."""
            if chat_id in chat_titles:
                return chat_titles[chat_id]
            try:
                e = await client.get_entity(chat_id)
                t = getattr(e, "title", None) or getattr(e, "username", None) or str(chat_id)
            except Exception:
                # Title lookup is cosmetic; never let it abort the search.
                t = str(chat_id)
            chat_titles[chat_id] = t
            return t

        count = 0
        async for msg in client.iter_messages(entity, **kwargs):
            if not scoped and since is not None:
                # NOTE(review): assumes global hits arrive newest-first, so the
                # first hit older than --since ends the scan — confirm.
                if msg.date < since or count >= args.limit:
                    break
            count += 1
            if scoped:
                print(fmt_msg(msg))
            else:
                # msg.chat_id is the marked peer id, so users, basic groups
                # and channels cannot be confused with one another.
                where = await title_for(msg.chat_id)
                print(f"[{where[:25]:<25}] {fmt_msg(msg)}")

        print(f"\n{count} hit(s)")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,90 @@
"""Send a message (text and/or file) to a Telegram chat.
Usage:
python tg_send.py "@username" "Hello"
python tg_send.py "Born2beroot" "Check this" --file report.pdf
python tg_send.py "@chan" "" --file image.png --caption "screenshot"
python tg_send.py "Saved Messages" "note to self" --yes
python tg_send.py "@x" "Reply" --reply-to 12345
python tg_send.py "@x" "Quiet ping" --silent
Defaults to dry-run preview + interactive [y/N] confirm. --yes skips it.
Saved Messages is resolvable as "me" or by your own username.
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
from telethon import TelegramClient
from config import API_HASH, API_ID, SESSION_NAME
from tg_utils import confirm, resolve_chat
async def main() -> None:
    """Preview and send a text message and/or a file to one chat.

    The target, payload and options are printed first; unless ``--yes`` is
    given the user must confirm before anything is sent. ``me``, ``self`` and
    ``Saved Messages`` (case-insensitive) all address the logged-in account's
    own Saved Messages.
    """
    p = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    p.add_argument("chat", help='@username, id, name; "me" for Saved Messages')
    p.add_argument("text", help="message text (use '' if only sending a file)")
    p.add_argument("--file", "-f", help="path to file/image to attach")
    p.add_argument("--caption", help="caption for the file (overrides text if --file given)")
    p.add_argument("--reply-to", type=int, help="message id to reply to")
    p.add_argument("--silent", action="store_true", help="send without notification")
    p.add_argument("--parse", choices=["md", "html", "none"], default="md",
                   help="text parse mode (default: md)")
    p.add_argument("--yes", "-y", action="store_true", help="skip confirmation")
    args = p.parse_args()

    if not args.text and not args.file:
        sys.exit("nothing to send: provide text and/or --file")
    if args.file and not Path(args.file).exists():
        sys.exit(f"file not found: {args.file}")

    parse_mode = None if args.parse == "none" else args.parse
    # Resolve Saved Messages explicitly: routing "me" through the
    # name-substring matcher could pick an unrelated chat whose title merely
    # contains "me" — unacceptable for a tool that sends messages.
    to_self = args.chat.strip().lower() in {"me", "self", "saved messages"}

    async with TelegramClient(SESSION_NAME, API_ID, API_HASH) as client:
        if to_self:
            entity = await client.get_me()
        else:
            entity = await resolve_chat(client, args.chat)
        title = getattr(entity, "title", None) or getattr(entity, "username", None) or str(entity.id)

        print(f"→ to: {title} (id={entity.id})")
        if args.file:
            print(f"→ file: {args.file}")
            print(f"→ cap: {(args.caption or args.text)[:120]}")
        else:
            # Mark shortened previews so they are not mistaken for the full text.
            preview = args.text if len(args.text) <= 200 else args.text[:200] + "…"
            print(f"→ text: {preview}")
        if args.reply_to:
            print(f"→ reply: msg #{args.reply_to}")
        if args.silent:
            print("→ silent: yes")

        if not args.yes and not confirm("Gönder?"):
            print("iptal.")
            return

        if args.file:
            sent = await client.send_file(
                entity,
                args.file,
                caption=args.caption or args.text or None,
                reply_to=args.reply_to,
                silent=args.silent,
                parse_mode=parse_mode,
            )
        else:
            sent = await client.send_message(
                entity,
                args.text,
                reply_to=args.reply_to,
                silent=args.silent,
                parse_mode=parse_mode,
            )
        print(f"✓ sent msg id={sent.id}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,100 @@
"""Shared helpers for the tg_* CLI scripts (read/send/search/inbox)."""
from __future__ import annotations
import sys
from datetime import datetime, timezone
from typing import Iterable
from telethon import TelegramClient
async def resolve_chat(client: TelegramClient, ref: str):
    """Resolve a chat reference to a Telethon entity.

    Accepts, in this order:
    - "me" / "self" (case-insensitive): the logged-in account, i.e. Saved
      Messages
    - "@username", or a reference starting with "+": passed to
      ``client.get_entity`` unchanged
    - numeric id (positive or negative; large negative for supergroups)
    - case-insensitive dialog-name substring; if several dialogs match, a
      single dialog whose whole name equals the reference wins

    Exits the process with an explanatory message when no dialog matches or
    the reference stays ambiguous.
    """
    ref = ref.strip()
    if ref.lower() in ("me", "self"):
        # Must not reach the substring search: "me" is contained in many
        # unrelated dialog names.
        return await client.get_me()
    if ref.startswith(("@", "+")):
        return await client.get_entity(ref)
    try:
        return await client.get_entity(int(ref))
    except ValueError:
        # Not numeric (or the id could not be resolved): try dialog names.
        pass

    needle = ref.lower()
    matches = []
    async for d in client.iter_dialogs(archived=None):
        if needle in (d.name or "").lower():
            matches.append(d)

    if not matches:
        sys.exit(f"chat not found: {ref!r}")
    if len(matches) > 1:
        # Without this a chat named "Trade" could never be picked by name
        # while a chat such as "Adrenalin Trade" also exists.
        exact = [d for d in matches if (d.name or "").strip().lower() == needle]
        if len(exact) == 1:
            return exact[0].entity
        preview = "\n ".join(f"{d.id:>15} {d.name}" for d in matches[:10])
        more = "" if len(matches) <= 10 else f"\n ... +{len(matches)-10} more"
        sys.exit(
            f"ambiguous chat {ref!r} ({len(matches)} matches):\n "
            f"{preview}{more}\nuse the numeric id or @username"
        )
    return matches[0].entity
def parse_date(s: str) -> datetime:
    """Parse ``YYYY-MM-DD`` or a full ISO timestamp into a UTC datetime.

    Surrounding whitespace is ignored. A trailing ``Z`` is accepted as UTC.
    Values without timezone information are taken to be UTC; values carrying
    another offset are converted to UTC, so the result always has a zero UTC
    offset.

    Raises:
        ValueError: if *s* matches neither format.
    """
    s = s.strip()
    if "T" in s or " " in s:
        # fromisoformat() only understands "Z" on newer Python versions.
        dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
    else:
        dt = datetime.strptime(s, "%Y-%m-%d")
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
def fmt_msg(msg, max_chars: int = 200) -> str:
    """Return a compact one-line representation of a Telethon Message.

    Layout: right-aligned id, ``YYYY-MM-DD HH:MM`` date, sender (cut and
    padded to 20 characters) and text, separated by two spaces. Newlines in
    the text become spaces, text longer than *max_chars* is cut to that
    length with a trailing "…", and a media-only message gets a
    ``[media:<type>]`` tag.
    """
    sender_obj = getattr(msg, "sender", None)
    if sender_obj is not None:
        sender = (
            getattr(sender_obj, "username", None)
            or getattr(sender_obj, "first_name", None)
            or getattr(sender_obj, "title", None)
            or str(msg.sender_id)
        )
    elif msg.sender_id:
        sender = str(msg.sender_id)
    else:
        sender = ""

    # Replace (not delete) newlines so words on adjacent lines do not fuse.
    text = (msg.message or "").replace("\n", " ")
    if len(text) > max_chars:
        text = text[: max_chars - 1] + "…"

    media = ""
    if msg.media and not msg.message:
        media = f" [media:{type(msg.media).__name__}]"

    stamp = msg.date.strftime("%Y-%m-%d %H:%M")
    return f"{msg.id:>9}  {stamp}  {sender[:20]:<20}  {text}{media}"
def confirm(prompt: str = "Onayla", default: bool = False) -> bool:
    """Ask a yes/no question on stdin and return the answer.

    The prompt is followed by ``[Y/n]`` when *default* is true and ``[y/N]``
    otherwise. An empty answer or end-of-input yields *default*; otherwise
    only "y", "yes", "evet", "e" and "ok" (case-insensitive) count as yes.
    """
    hint = "[Y/n]" if default else "[y/N]"
    try:
        answer = input(f"{prompt} {hint}: ")
    except EOFError:
        return default
    answer = answer.strip().lower()
    if answer == "":
        return default
    return answer in ("y", "yes", "evet", "e", "ok")
def chunked(items: Iterable, size: int):
    """Yield consecutive lists of *size* elements taken from *items*.

    The final list holds whatever is left over and may be shorter; nothing is
    yielded for an empty input.
    """
    pending = []
    for element in items:
        pending.append(element)
        if len(pending) != size:
            continue
        yield pending
        pending = []
    if pending:
        yield pending