diff --git a/.claude/skills/triage-jobs/SKILL.md b/.claude/skills/triage-jobs/SKILL.md new file mode 100644 index 0000000..12a4627 --- /dev/null +++ b/.claude/skills/triage-jobs/SKILL.md @@ -0,0 +1,168 @@ +--- +name: triage-jobs +description: Triage the latest Telegram vacancy inbox (`tracking/telegram_inbox.json`) — stratifies by priority (p1 in this session, p2/p3 via a Haiku subagent) and returns a deduped shortlist of vacancies worth applying to. Use when the user says "разбери inbox", "что нового по работе", "разбери вакансии", "пройдись по telegram-вакансиям", "triage the inbox", "find relevant jobs from telegram", or similar. +--- + +# triage-jobs + +Read the latest Telegram vacancy fetch and present a shortlist that fits Oleg's targeting. + +The inbox file is large (~200K tokens). To keep the main session lean, p2/p3 channels go to a **Haiku subagent** that returns only finalists. p1 (small, high-signal) is processed here. + +## Args + +Optional priority filter as positional arg(s): `p1`, `p2`, `p3`, or `all` (default). + +- `/triage-jobs` → all three tiers +- `/triage-jobs p1` → only p1 in this session, skip subagent +- `/triage-jobs p2 p3` → only subagent run + +## Step 1 — Pre-checks + +Verify the inbox exists and is recent: + +```bash +ls -lh tracking/telegram_inbox.json +jq '{generated_at, total_in_inbox, channels_count: (.channels | length)}' tracking/telegram_inbox.json +``` + +If the file is missing or its `generated_at` is older than ~6 hours, **don't run triage on stale data** — offer to refetch first: + +```bash +~/.local/bin/uv run scripts/list_telegram_channels.py \ + | ~/.local/bin/uv run scripts/fetch_telegram_jobs.py - +``` + +## Step 2 — Oleg's targeting (apply strictly during triage) + +This is the rubric — use it verbatim when deciding "keep or drop" and when briefing the subagent. 
+ +**Roles he targets:** +- Senior / Staff / Principal Full-Stack Engineer +- Tech Lead, Engineering Lead, Engineering Manager (with hands-on) +- AI Engineer / Applied AI / LLM Engineer (TS or Python OK for AI roles) + +**Stack match (strong signal):** TypeScript, JavaScript, Node.js, React, Next.js, TanStack, Tailwind, PostgreSQL, Drizzle, Vercel, Cloudflare, Sanity/Storyblok/Contentful/Payload (Headless CMS), Shopify/Hydrogen, GraphQL, WebSockets. For AI roles also: LLM orchestration, MCP, RAG, embeddings, Mastra, Vercel AI SDK, Claude/GPT/Gemini APIs. + +**Culture must-haves:** +- **Global remote** (he's in GMT+7, full EMEA overlap + US East AM). EMEA / global / US-East-friendly TZ all fine. "Remote within Russia only" or "US only — must be in EST 9-5" → reject. +- **Compensation in USD/EUR** preferred. Target ~$100k+ FT or $70+/hr contractor. Russian-RUB roles at ₽70-100k/mo (≈ $750-1100) are below floor. +- Deel/W-8BEN contractor format is a plus. + +**Deal-breakers (auto-reject):** +- Mobile-native (Kotlin, Swift, Android, iOS, Flutter) +- Non-stack backend (Go/Golang, Java, .NET, C#, Ruby, PHP, Rust, Scala) **as primary** — if the role is fullstack with React/Node + Go on side, that's fine +- DevOps / SRE as primary role +- QA / Manual testing +- Sales, Marketing, Designer, Recruiter, PM (non-engineering) +- Junior / Trainee / Intern +- On-site outside major remote-friendly hubs (e.g. 
Lagos, low-cost-region on-site) +- Sub-$50k FT compensation when the salary is stated + +**Stretch interests (consider even if not perfect match):** +- AI/ML engineering roles using Python (his AI CV covers this) +- Vetted-contractor platforms (Toptal, Lemon.io, Turing) — separate financial track +- Headless CMS, Shopify Hydrogen, eCommerce platforms +- Roles at companies building dev tooling, AI agents, MCP ecosystem (his open-source overlaps) + +For canonical source-of-truth, the CVs are at: +- `base/oleg_proskurin_ai_engineer_fullstack_cv.md` +- `base/oleg_proskurin_fullstack_techlead_cv.md` + +## Step 3 — p1 (process here) + +Pull p1 channels from inbox and walk through every kept message: + +```bash +jq '.channels | to_entries | map(select(.value.priority == "p1")) | from_entries' tracking/telegram_inbox.json +``` + +For each kept message, classify: +- **Apply** — fits role + culture + comp. Note: company, role, link, why-fit (1 line). +- **Maybe** — fits role/stack but unclear comp or stretch culture. Note same fields + the uncertainty. +- **Drop** — fails targeting. Don't list, don't explain. + +p1 should be small enough (~12K tokens currently) to do in the main session without context strain. + +## Step 4 — p2 and p3 (delegate to Haiku subagent) + +Spawn a subagent via the Agent tool. Use `general-purpose` agent type with **Haiku model** for cost/speed. + +**Critical:** the subagent does not see this conversation. The prompt must be self-contained. + +Template (fill `<PRIORITY>` with `p2` or `p3`, or pass both p2 and p3 in one call): + +``` +Agent({ + description: "Triage Telegram inbox <PRIORITY>", + subagent_type: "general-purpose", + model: "haiku", + prompt: ` +Triage job postings from Oleg's Telegram inbox. 
+ +Read tracking/telegram_inbox.json and filter to channels with priority "<PRIORITY>": + + jq '.channels | to_entries | map(select(.value.priority == "<PRIORITY>")) | from_entries' tracking/telegram_inbox.json + +Oleg's targeting (apply strictly): + +[paste the "Step 2 — Oleg's targeting" section verbatim] + +For each kept message that is a REAL vacancy (not a resume/CV digest entry, not a chat-room message, not a market-intel essay), decide if it fits the targeting. + +Return ONLY the shortlist as JSON. Reject everything else silently — no commentary on rejected items. + +Shortlist schema: +[ + { + "channel": "<channel key from the inbox>", + "id": <message id>, + "link": "<message link>", + "title": "<original title, verbatim>", + "company": "<company name>", + "stack": ["<technology>"], + "comp": "<compensation as stated>", + "remote": true | false | "unclear", + "fit": "apply" | "maybe", + "why": "<one-line reason>" + } +] + +If there are zero matches, return []. +Do not paraphrase or summarize messages — quote the original title verbatim and just extract structured fields. +Do not include personal opinions or formatting commentary. +` +}) +``` + +Run subagents in parallel where possible (one for p2, one for p3 in the same message). + +## Step 5 — Aggregate and present + +Combine p1 finalists (from Step 3) with subagent shortlists (Step 4). Dedupe by `(company, title)` pair when possible. + +Present grouped output to Oleg, e.g.: + +``` +🎯 APPLY (N) +- jaabz_com #10233 — AI-Native Full Stack Developer @ Geeky Tech — TS/React/Python, Fully Remote, B2B SaaS +- dev_connectablejobs #2039 — Full-Stack Engineer @ VOYGR — AI-native, $4-7k, Remote, founders ex-Google +- ... + +🤔 MAYBE (N) +- jsspeak #58062 — AI Engineer (Python & Node.js) Senior @ Eshe App — 300-400k₽, RU+BY citizenship only — fit but comp lower +- ... +``` + +## Step 6 — Suggest next step + +After the shortlist, offer to: +- Append apply-list to `tracking/applications.md` (one row each, status `to-apply`). +- For 1-2 top picks, switch to the tailoring workflow (see main CLAUDE.md "Workflow 2 — Tailor CV"). 
+ +## Notes + +- **Don't auto-add to `applications.md`** without explicit confirmation — Oleg curates that file. +- **Don't auto-refetch.** If the inbox is stale, ask first. +- **State cursor advances on every fetch.** A skill run only reads the existing inbox — it doesn't trigger a new fetch unless explicitly requested. +- **Skip P3 by default if user says "quick triage"** — p3 is mostly market-intel and dead channels, low ROI. diff --git a/.gitignore b/.gitignore index 697ded8..4296bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,8 @@ output/html/ # Local Chrome profile (contains cookies, logins — never commit) .chrome/ + +# Telegram fetch outputs — per-machine cursor + per-run scratch +tracking/telegram_state.json +tracking/telegram_inbox.json +tracking/telegram_pending_channels.json diff --git a/CLAUDE.md b/CLAUDE.md index 4b7dda6..3001402 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -74,6 +74,21 @@ A read-only investigation never needs permission; this rule only applies to muta --- +## Workflow — finding vacancies in Telegram + +Operational details — file layout, scripts, filter schema, priority rubric, new-channel triage procedure — live in **[`tracking/CLAUDE.md`](tracking/CLAUDE.md)**. Read that first before doing any vacancy-related Telegram work. + +Quick orientation: +- **Source of subscriptions**: Telegram folder "Jobs" (id=6), curated manually by Oleg. Never mirror its membership to a repo file. +- **Pipeline**: `scripts/list_telegram_channels.py` → `scripts/fetch_telegram_jobs.py` (chainable via stdin). +- **Curated config**: [`tracking/telegram_channels.json`](tracking/telegram_channels.json) — per-channel `lang`, `priority` (`p1`/`p2`/`p3`), and filter (`include` / `exclude`). +- **Output**: [`tracking/telegram_inbox.json`](tracking/telegram_inbox.json) — filtered messages, overwritten each run. +- **Trigger**: manual only — run when Oleg explicitly asks (e.g. "забери свежее из Jobs"). No background polling. 
+
+After a run, triage `telegram_inbox.json` and promote promising postings to [`tracking/applications.md`](tracking/applications.md). For the triage itself, use the **[`triage-jobs` skill](.claude/skills/triage-jobs/SKILL.md)** — it stratifies by priority and delegates p2/p3 (the bulk) to a Haiku subagent so the main session stays lean.
+
+---
+
 ## Folder layout
 
 ```
diff --git a/scripts/fetch_telegram_jobs.py b/scripts/fetch_telegram_jobs.py
new file mode 100755
index 0000000..3139390
--- /dev/null
+++ b/scripts/fetch_telegram_jobs.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "telethon>=1.42",
+#   "python-dotenv>=1.0",
+# ]
+# ///
+"""
+Fetch new messages from a list of Telegram channels (job-vacancy feeds),
+filter them per-channel by curated keywords, and surface untriaged ("new")
+channels for keyword decisions.
+
+Inputs:
+  - channel usernames/ids as positional args, OR `-` to read a JSON array from stdin
+  - .env in the project root (TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_SESSION_STRING)
+  - tracking/telegram_state.json — per-channel last_message_id (created if missing)
+  - tracking/telegram_channels.json — per-channel curated metadata (lang, priority)
+    and filter (include/exclude). See tracking/CLAUDE.md.
+
+Outputs:
+  - tracking/telegram_inbox.json — kept messages (filtered for known
+    channels, unfiltered for new ones)
+  - tracking/telegram_pending_channels.json — keyword-frequency scan for new
+    channels (only when present;
+    deleted otherwise)
+  - tracking/telegram_state.json — updated with newest seen ids
+  - stdout — summary, with prominent "NEW CHANNELS" line when pending exist
+"""
+
+import asyncio
+import json
+import os
+import sys
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+from dotenv import load_dotenv
+from telethon import TelegramClient
+from telethon.sessions import StringSession
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+STATE_FILE = PROJECT_ROOT / "tracking" / "telegram_state.json"
+OUTPUT_FILE = PROJECT_ROOT / "tracking" / "telegram_inbox.json"
+CHANNELS_FILE = PROJECT_ROOT / "tracking" / "telegram_channels.json"
+PENDING_FILE = PROJECT_ROOT / "tracking" / "telegram_pending_channels.json"
+
+# First time we see a channel, how far back to look
+DEFAULT_LOOKBACK_DAYS = 30
+# Hard cap per channel per run, to avoid runaway on busy channels
+MAX_PER_CHANNEL = 500
+
+
+def load_credentials():
+    """Read the Telegram API credentials from .env / the process environment.
+
+    Returns a tuple (api_id, api_hash, session) with api_id converted to int.
+    Exits the process, naming the variable, when one of them is not set.
+    """
+    load_dotenv(PROJECT_ROOT / ".env")
+    try:
+        api_id = int(os.environ["TELEGRAM_API_ID"])
+        api_hash = os.environ["TELEGRAM_API_HASH"]
+        session = os.environ["TELEGRAM_SESSION_STRING"]
+    except KeyError as e:
+        sys.exit(f"missing env var: {e}. check .env in project root.")
+    return api_id, api_hash, session
+
+
+def load_json(path, default):
+    """Return the parsed JSON content of `path`, or `default` if it is missing.
+
+    Decoded explicitly as UTF-8: these files hold non-ASCII text (Cyrillic
+    keywords and message bodies), so the locale's default encoding is unsafe.
+    """
+    if path.exists():
+        return json.loads(path.read_text(encoding="utf-8"))
+    return default
+
+
+def save_json(path, data):
+    """Write `data` to `path` as indented JSON, creating parent directories.
+
+    ensure_ascii=False keeps non-ASCII text readable in the file, so the
+    encoding is pinned to UTF-8 rather than left to the locale default.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
+    )
+
+
+def channel_key(value):
+    """Turn a raw channel reference into the value handed to Telethon.
+
+    Ints and digit-only strings (optionally "-"-prefixed) become ints; any
+    other string is treated as a username and loses its leading "@".
+    """
+    if isinstance(value, int):
+        return value
+    if value.lstrip("-").isdigit():
+        return int(value)
+    return value.lstrip("@")
+
+
+def normalize_filter(spec):
+    """
+    Normalize a channel filter spec to {"groups": [...], "excludes": [...]}.
+
+    Accepted input forms:
+        []                          -> trust-all (no filter)
+        ["a", "b"]                  -> single OR-group
+        [["a","b"], ["c","d"]]      -> AND of OR-groups
+        [["a","b"], "c"]            -> scalars promoted
+        {"include": <any form above>,
+         "exclude": ["x", "y"]}     -> include + negative filter
+
+    A bare string for include/exclude is treated as a one-keyword list and
+    null as absent, so a slip in the hand-edited JSON cannot turn into
+    per-character matching or a crash.
+
+    Filter semantics: message passes if (no exclude keyword matches) AND
+    (every include-group has at least one match). Empty include = trust-all
+    (only exclude is applied).
+    """
+    if isinstance(spec, dict):
+        include = spec.get("include") or []
+        excludes = spec.get("exclude") or []
+    else:
+        include = spec or []
+        excludes = []
+    # Promote bare strings: iterating a str would yield single characters.
+    if isinstance(include, str):
+        include = [include]
+    if isinstance(excludes, str):
+        excludes = [excludes]
+    excludes = list(excludes)
+
+    if not include:
+        groups = []
+    elif any(isinstance(item, list) for item in include):
+        groups = [item if isinstance(item, list) else [item] for item in include]
+    else:
+        groups = [include]
+
+    return {"groups": groups, "excludes": excludes}
+
+
+def msg_passes_filter(text, spec):
+    """Return True when `text` passes the channel filter `spec`.
+
+    Matching is case-insensitive substring search: any exclude hit rejects
+    the message; otherwise every include group needs at least one hit.
+    """
+    f = normalize_filter(spec)
+    t = text.lower()
+    # Any exclude hit → reject immediately
+    if any(ex.lower() in t for ex in f["excludes"]):
+        return False
+    if not f["groups"]:
+        return True  # trust-all (no positive constraints)
+    return all(
+        any(kw.lower() in t for kw in group) if group else True
+        for group in f["groups"]
+    )
+
+
+def flatten_keywords(keywords_config):
+    """Deduped union of every keyword (include + exclude) across every channel."""
+    out = set()
+    for spec in keywords_config.values():
+        f = normalize_filter(spec)
+        for group in f["groups"]:
+            out.update(group)
+        out.update(f["excludes"])
+    return sorted(out)
+
+
+def keyword_frequencies(messages, all_keywords):
+    """Count, per keyword, how many messages contain it.
+
+    Matching is case-insensitive substring search. Keywords with no hits are
+    omitted; the result is ordered by descending count.
+    """
+    counts = {}
+    texts_lower = [m["text"].lower() for m in messages]
+    for kw in all_keywords:
+        kw_lower = kw.lower()
+        n = sum(1 for t in texts_lower if kw_lower in t)
+        if n > 0:
+            counts[kw] = n
+    return dict(sorted(counts.items(), key=lambda kv: -kv[1]))
+
+
+async def fetch_channel(client, key, last_id, lookback_dt):
+    """Fetch one channel's new messages and the cursor to store for it.
+
+    Args:
+        client: connected TelegramClient.
+        key: channel username or numeric id (see channel_key).
+        last_id: newest message id seen on a previous run, or None on the
+            first run for this channel.
+        lookback_dt: cutoff datetime; only applied when last_id is None.
+
+    Returns:
+        (messages, max_id, max_date, truncated): kept messages in
+        chronological order, the newest id/date encountered, and whether
+        the MAX_PER_CHANNEL cap was hit.
+    """
+    messages = []
+    max_id = last_id or 0
+    max_date = None
+    truncated = False
+
+    kwargs = {"limit": MAX_PER_CHANNEL}
+    if last_id:
+        kwargs["min_id"] = last_id
+
+    count = 0
+    async for msg in client.iter_messages(key, **kwargs):
+        count += 1
+        # Anchor cursor to newest id we encounter even if we discard the
+        # message (too old, no content). iter_messages yields newest-first.
+        if msg.id > max_id:
+            max_id = msg.id
+            max_date = msg.date
+        if last_id is None and msg.date < lookback_dt:
+            break
+        text = (msg.message or "").strip()
+        if text or msg.media:
+            messages.append({
+                "id": msg.id,
+                "date": msg.date.isoformat(),
+                "text": text,
+                "has_media": bool(msg.media),
+                "link": f"https://t.me/{key}/{msg.id}" if isinstance(key, str) else None,
+            })
+        # Evaluate the cap for every fetched message, including content-less
+        # ones that were skipped, so hitting the limit is always reported.
+        if count >= MAX_PER_CHANNEL:
+            truncated = True
+            break
+
+    messages.reverse()  # chronological
+    return messages, max_id, max_date, truncated
+
+
+async def main(channels):
+    """Fetch, filter and persist messages for every channel in `channels`.
+
+    Writes the inbox, the state cursor and (when untriaged channels exist)
+    the pending-channels scan, then prints a summary. A channel whose fetch
+    raises is recorded with an "error" entry and does not abort the run.
+    """
+    api_id, api_hash, session = load_credentials()
+    state = load_json(STATE_FILE, {})
+    channels_config = load_json(CHANNELS_FILE, {})
+    all_keywords = flatten_keywords(channels_config)
+    lookback_dt = datetime.now(timezone.utc) - timedelta(days=DEFAULT_LOOKBACK_DAYS)
+
+    output = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "lookback_days_for_new_channels": DEFAULT_LOOKBACK_DAYS,
+        "channels": {},
+        "total_in_inbox": 0,
+    }
+    pending = {}
+
+    async with TelegramClient(StringSession(session), api_id, api_hash) as client:
+        for raw in channels:
+            key = channel_key(raw)
+            # Use the normalized name for state/config/output keys so that
+            # "@name" resolves to the same curated entry and cursor as "name".
+            name = str(key)
+            ch_state = state.get(name) or state.get(str(raw), {})
+            last_id = ch_state.get("last_message_id")
+            first_run = last_id is None
+
+            ch_cfg = channels_config.get(name, channels_config.get(str(raw)))
+            ch_lang = ch_cfg.get("lang") if isinstance(ch_cfg, dict) else None
+            ch_priority = ch_cfg.get("priority") if isinstance(ch_cfg, dict) else None
+
+            try:
+                msgs, max_id, max_date, truncated = await fetch_channel(
+                    client, key, last_id, lookback_dt
+                )
+            except Exception as e:
+                output["channels"][name] = {
+                    "lang": ch_lang,
+                    "priority": ch_priority,
+                    "error": f"{type(e).__name__}: {e}",
+                    "messages": [],
+                }
+                continue
+            if ch_cfg is None:
+                # New / untriaged channel — pass everything through unfiltered,
+                # but log keyword-frequency scan for the agent to triage.
+                kept = msgs
+                filter_mode = "unfiltered (new channel — not yet curated)"
+                pending[name] = {
+                    "messages_scanned": len(msgs),
+                    "first_run": first_run,
+                    "truncated": truncated,
+                    "keyword_counts_from_other_channels": (
+                        keyword_frequencies(msgs, all_keywords) if all_keywords else {}
+                    ),
+                    "note": (
+                        "decide lang, priority (p1/p2/p3) and keywords (existing or "
+                        f"new) for this channel; add an entry to "
+                        f"{CHANNELS_FILE.relative_to(PROJECT_ROOT)} — see tracking/CLAUDE.md"
+                    ),
+                }
+            else:
+                f = normalize_filter(ch_cfg)
+                has_constraints = bool(f["groups"] or f["excludes"])
+                kept = [m for m in msgs if msg_passes_filter(m["text"], ch_cfg)]
+                filter_mode = (
+                    f"filtered (groups={len(f['groups'])}, excludes={len(f['excludes'])})"
+                    if has_constraints
+                    else "trust-all (no filter)"
+                )
+
+            output["channels"][name] = {
+                "lang": ch_lang,
+                "priority": ch_priority,
+                "seen": len(msgs),
+                "kept": len(kept),
+                "filtered_out": len(msgs) - len(kept),
+                "first_run": first_run,
+                "truncated": truncated,
+                "filter_mode": filter_mode,
+                "messages": kept,
+            }
+            output["total_in_inbox"] += len(kept)
+
+            if max_id > (last_id or 0):
+                state[name] = {
+                    "last_message_id": max_id,
+                    "last_seen_date": max_date.isoformat() if max_date else None,
+                }
+
+    save_json(STATE_FILE, state)
+    save_json(OUTPUT_FILE, output)
+    if pending:
+        save_json(PENDING_FILE, pending)
+    elif PENDING_FILE.exists():
+        PENDING_FILE.unlink()
+
+    rel_inbox = OUTPUT_FILE.relative_to(PROJECT_ROOT)
+    rel_pending = PENDING_FILE.relative_to(PROJECT_ROOT)
+    rel_chans = CHANNELS_FILE.relative_to(PROJECT_ROOT)
+
+    print(
+        f"{output['total_in_inbox']} messages in inbox "
+        f"(from {len(channels)} channels) → {rel_inbox}"
+    )
+    if pending:
+        names = ", ".join(pending.keys())
+        print(f"NEW CHANNELS ({len(pending)}): {names}")
+        print(f" keyword-frequency scan → {rel_pending}")
+        print(f" curate lang/priority/keywords in {rel_chans} (see tracking/CLAUDE.md)")
+
+
+def parse_args():
+    """Return the channel list from argv, or from a JSON array on stdin ("-").
+
+    Exits with a readable message instead of a traceback when no channel is
+    given or stdin does not contain a JSON array (for example because the
+    upstream list_telegram_channels.py failed and printed nothing).
+    """
+    if len(sys.argv) < 2:
+        sys.exit(
+            "usage: fetch_telegram_jobs.py <channel> [<channel> ...]\n"
+            "       fetch_telegram_jobs.py -   (read JSON array of channels from stdin)"
+        )
+    if sys.argv[1] != "-":
+        return sys.argv[1:]
+    try:
+        channels = json.loads(sys.stdin.read())
+    except json.JSONDecodeError as e:
+        sys.exit(f"stdin is not valid JSON ({e}); expected an array of channels")
+    if not isinstance(channels, list):
+        sys.exit("stdin JSON must be an array of channel usernames/ids")
+    return channels
+
+
+if __name__ == "__main__":
+    asyncio.run(main(parse_args()))
diff --git a/scripts/list_telegram_channels.py b/scripts/list_telegram_channels.py
new file mode 100755
index 0000000..3e8d81d
--- /dev/null
+++ b/scripts/list_telegram_channels.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "telethon>=1.42",
+#   "python-dotenv>=1.0",
+# ]
+# ///
+"""
+List broadcast channels and supergroups in a named Telegram folder (default: "Jobs").
+
+Output: JSON array of usernames (falling back to numeric id for private channels) on stdout.
+
+Pipe directly into the fetch script:
+    list_telegram_channels.py | fetch_telegram_jobs.py -
+"""
+
+import asyncio
+import json
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+from telethon import TelegramClient
+from telethon.sessions import StringSession
+from telethon.tl.functions.messages import GetDialogFiltersRequest
+from telethon.tl.types import Channel, InputPeerChannel
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+DEFAULT_FOLDER = "Jobs"
+
+
+def filter_title(f):
+    """DialogFilter.title is str on older Telethon, TextWithEntities on newer."""
+    t = getattr(f, "title", None)
+    if t is None:
+        return ""
+    return t if isinstance(t, str) else getattr(t, "text", "")
+
+
+async def main(folder_title):
+    """Print a JSON array of the channels in the folder titled `folder_title`.
+
+    Channels with a username are listed by username; the rest are listed as
+    str(-1000000000000 - id). Exits with a message when a credential is
+    missing or no folder has that title.
+    """
+    load_dotenv(PROJECT_ROOT / ".env")
+    try:
+        api_id = int(os.environ["TELEGRAM_API_ID"])
+        api_hash = os.environ["TELEGRAM_API_HASH"]
+        session = os.environ["TELEGRAM_SESSION_STRING"]
+    except KeyError as e:
+        sys.exit(f"missing env var: {e}. check .env in project root.")
+
+    async with TelegramClient(StringSession(session), api_id, api_hash) as client:
+        result = await client(GetDialogFiltersRequest())
+
+        target = None
+        for f in result.filters:
+            if filter_title(f) == folder_title:
+                target = f
+                break
+        if target is None:
+            sys.exit(f"folder {folder_title!r} not found")
+
+        # Combine pinned + included peers — both are part of the folder
+        wanted_channel_ids = set()
+        for peer in list(getattr(target, "pinned_peers", []) or []) + list(target.include_peers):
+            if isinstance(peer, InputPeerChannel):
+                wanted_channel_ids.add(peer.channel_id)
+
+        # Resolve channel entities to extract usernames
+        usernames = []
+        async for dialog in client.iter_dialogs():
+            ent = dialog.entity
+            if not isinstance(ent, Channel):
+                continue
+            if ent.id not in wanted_channel_ids:
+                continue
+            usernames.append(ent.username or str(-1000000000000 - ent.id))
+
+    print(json.dumps(usernames, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    folder = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FOLDER
+    asyncio.run(main(folder))
diff --git a/tracking/CLAUDE.md b/tracking/CLAUDE.md
new file mode 100644
index 0000000..1c5667b
--- /dev/null
+++ b/tracking/CLAUDE.md
@@ -0,0 +1,227 @@
+# tracking/ — Job-search tracking and Telegram vacancy pipeline
+
+This folder is the operational layer of the job search: the curated channel registry, the live cursor for incremental Telegram pulls, the staging area for messages awaiting triage, and the long-form logs of applications and outreach.
+
+If you (Claude) are about to do anything related to "find vacancies in Telegram", "scan job channels", "what's new in Jobs", "triage a new channel", or similar — this is the file to read first. The main `CLAUDE.md` references it from the Telegram workflow section.
+
+---
+
+## Files at a glance
+
+| File | Purpose | In git?
| +|---|---|---| +| `telegram_channels.json` | **Curated source of truth** — per-channel `lang`, `priority`, and filter (`include`/`exclude`). Tunable by hand. | ✅ committed | +| `telegram_state.json` | Per-machine cursor — `last_message_id` and `last_seen_date` per channel. Regenerated automatically. | ❌ gitignored | +| `telegram_inbox.json` | Output of the last fetch run — kept messages only, per channel, with `lang`/`priority` injected. Overwritten each run. | ❌ gitignored | +| `telegram_pending_channels.json` | Generated only when the last run had **new** (untriaged) channels — keyword-frequency scan to bootstrap their curation. Deleted on the next run if no pending. | ❌ gitignored | +| `applications.md` | One row per application — manually maintained, append-only. | ✅ committed | +| `outreach.md` | Cold messages, recruiter pings, follow-ups. One row per touchpoint. | ✅ committed | + +--- + +## Running the pipeline + +Two scripts, chainable. Always run from project root. + +```bash +~/.local/bin/uv run scripts/list_telegram_channels.py \ + | ~/.local/bin/uv run scripts/fetch_telegram_jobs.py - +``` + +**Step 1 — `scripts/list_telegram_channels.py`**: reads the live "Jobs" folder from Telegram via Telethon and emits a JSON array of channel usernames (or numeric ids for private channels) to stdout. Always run fresh — Oleg curates the folder manually and adds new channels regularly. + +**Step 2 — `scripts/fetch_telegram_jobs.py`**: pulls new messages per channel, applies the per-channel filter, and writes results to `telegram_inbox.json`. Accepts channels as positional args or as a JSON array on stdin (`-`). + +### Constants in the fetch script + +- `DEFAULT_LOOKBACK_DAYS = 30` — first-time lookback window for new channels (no cursor yet). +- `MAX_PER_CHANNEL = 500` — hard cap on raw messages fetched per channel per run. A channel that posts >500 messages in the lookback window gets `truncated: true` in the output and we silently miss the tail. 
Tune per scenario (see "Truncation" below). + +### Trigger + +Vacancy scans run **only when Oleg explicitly asks** (e.g. "забери свежее из Jobs", "что нового в каналах"). No background polling. + +--- + +## telegram_channels.json — schema + +Each entry is keyed by `username` (or numeric id for private channels) and is an object: + +```jsonc +{ + "<username or numeric id>": { + "lang": "ru" | "en" | "...", // required + "priority": "p1" | "p2" | "p3", // required + "include": <one of the four forms below>, // optional — absent = trust-all (no positive constraint) + "exclude": ["kw1", "kw2", ...] // optional — absent = no negative constraint + } +} +``` + +A message **passes the filter** when: +1. **No** `exclude` keyword (case-insensitive substring) is present, AND +2. Every `include` OR-group contributes at least one match. + +If both `include` and `exclude` are absent → **trust-all** (every message passes; useful for low-volume personal/digest channels). + +### `include` — the four forms + +| Form | Semantics | Example | +|---|---|---| +| `[]` or absent | trust-all | _(no constraint)_ | +| `["a", "b"]` | flat OR — at least one matches | `["javascript", "react"]` | +| `[["a", "b"], ["c", "d"]]` | AND of OR-groups — every group needs ≥1 hit | `[["#vacancy","#вакансия"], ["#remote","#удаленка"]]` | +| `[["a","b"], "c"]` | scalars auto-promoted to single-item groups | same as `[["a","b"], ["c"]]` | + +### `exclude` — flat list + +If **any** keyword in `exclude` appears in the text → the message is **rejected**, even if `include` would have matched. Used to drop wrong-stack postings from generic channels. + +Standard Oleg-stack excludes for jobs feeds: +```json +["kafka", "golang", "kotlin", "android", "swift"] +``` + +For *_jobs channels with hashtag-based filters, add resume excludes too: +```json +["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] +``` + +### Pitfalls + +- **Case-insensitive substring matching**, no word boundaries. 
`"go"` matches "going" / "Goldbelt" / "google" — that's why we use `"golang"` instead. Same trap for `"java"` (matches "javascript"); use `" java "` with spaces, or `"#java "` for hashtag form. Pad other short keywords that occur inside longer words the same way: `" rust "`, `" ios "`. A padded keyword only matches when it has a space on both sides, so it misses the word at the start or end of a line or next to punctuation ("Java," / "(iOS)") — add those variants if the channel uses them. +- **`react native`** in `exclude` would also block `"react native"` mentions in fullstack postings. Prefer excluding `kotlin`/`android`/`swift`/`flutter` to block mobile, and only block React Native when the channel is mobile-only. +- The same keyword can appear in `include` for one channel and `exclude` for another — they're per-channel, independent. + +--- + +## Priority levels + +Set on every channel. Assignment is judged **by the best vacancy seen in a fresh fetch** for that channel — not by volume or hashtag density. + +| Level | Meaning | Triage attention | +|---|---|---| +| **p1** | Very relevant — strong stack hits **and** global-remote culture. Posts that Oleg would actually apply to. | Read every kept message. | +| **p2** | Stack OK but culture is internal market (Russian RUB/CIS-only roles), or culture OK but salary band typically misses Oleg's threshold (US-only with low pay, Netherlands on-site). Worth periodic scanning — occasional gems. Market-intel channels (recruiter content) live here too. | Skim, dive into interesting headlines. | +| **p3** | Wrong stack (mobile-native, devops, QA), off-market (Nigeria with ₦ salaries, Netherlands junior on-site), pure chat/noise, founder lifestyle blogs, or dead channels (0 messages in lookback). Subscribed for completeness — Oleg may pivot or want occasional glance. | Glance only on request. | + +When triaging the inbox, sort/group by `priority` first, then by `lang`. + +--- + +## Language codes + +Free-form short ISO-style codes — pick what fits: +- `ru` — Russian (most curated channels) +- `en` — English +- `mixed` — multi-language channel, when you can't pick a primary +- `nl`, `de`, etc. 
— for regional boards + +This isn't strict; it's a hint for triage attention (Oleg reads ru and en fluently; everything else carries translation overhead). + +--- + +## Triaging a new channel — full procedure + +A "new" channel = one that's in the Telegram "Jobs" folder but doesn't have an entry in `telegram_channels.json`. Detected automatically: the fetch script puts its raw messages into `telegram_inbox.json` unfiltered and writes a keyword-frequency scan to `telegram_pending_channels.json`. + +Steps to graduate a channel out of pending: + +1. **Read `telegram_pending_channels.json`** — for each new channel: + - `keyword_counts_from_other_channels`: how often every existing keyword (include + exclude across all channels) appears in this channel's recent messages. Quick signal of stack and posting style. + - `messages_scanned`, `first_run`, `truncated`: volume context. +2. **Open `telegram_inbox.json`** and sample 3–8 messages from this channel directly: + ```bash + jq -r '.channels["<channel>"].messages[:5] | .[] | "── \(.date[0:16])\n\(.text[0:400])\n"' tracking/telegram_inbox.json + ``` + Look for: hashtag patterns, language, post structure (single role vs digest vs chat), recurring noise types. +3. **Decide `lang` and `priority`** using the rubrics above. Base priority on the **best** vacancy in the sample, not the average. +4. **Decide filter shape:** + - Channel posts proper `#vacancy`/`#вакансия` + `#remote`/`#удаленка` tags → use the standard hashtag AND-of-OR + Oleg-stack excludes (most *_jobs channels). + - Channel posts vacancy text without consistent hashtags → use **positive stack include** (`["javascript", "typescript", "react", ...]`) + the same Oleg-stack excludes. + - Channel is low-volume personal/curated content (recruiter musings, market intel) where the value is the whole post → **trust-all** (omit `include` and `exclude`). + - Channel is a digest that mixes resumes and vacancies (e.g. 
`javascript_jobs_feed`) → trust-all is usually the right call; filtering `резюме` would drop the whole digest. + - Channel is mostly noise/wrong stack but worth keeping subscribed → strict positive filter, accept that most runs will return 0. +5. **Add the entry to `telegram_channels.json`**. JSON is hand-edited; keep entries ordered by `priority` then alphabetically for readability. +6. **Rerun the chain.** The channel transitions out of pending. The `telegram_pending_channels.json` file is automatically deleted when no pending channels remain. +7. **Validate** — sample the new `kept` messages and verify nothing wrong is passing or being dropped. If the filter is wrong, edit and rerun (state cursor is fine to keep — incremental fetches re-filter only new messages, so to validate the filter on history you may want to clear state for that channel; jq only prints the result, so write it back explicitly: `jq 'del(.["<channel>"])' tracking/telegram_state.json > /tmp/telegram_state.json && mv /tmp/telegram_state.json tracking/telegram_state.json`). + +### Sanity-check existing filters + +When tuning, always: +- Sample `kept` messages — are they all valid for Oleg? +- For channels with `kept == 0`, **verify with an unfiltered pull** (temporarily remove the channel's entry, clear its state cursor so history is fetched again, and rerun for it alone) that nothing legitimate is being thrown away. Don't assume 0 = correct without checking. + +--- + +## Truncation — when the 500-message cap bites + +A channel with `"truncated": true` in `telegram_inbox.json` reached the 500-message cap: it had at least 500 raw messages since its cursor (or, on a first run, in the lookback window). We see the most-recent 500 and silently miss the tail (the older portion). + +For `*_jobs` Russian channels truncation typically means we covered 1–10 days of a 30-day window. Strict hashtag filters then leave 1–7 kept messages — but the **missed** older messages could contain relevant vacancies. + +Options: +- Bump `MAX_PER_CHANNEL` globally (more API calls, longer run). +- Narrow lookback for the busy channel (no per-channel knob today — would require a code change). 
+- Tune the filter to be stricter so fewer raw messages need processing — only useful if the filter applies at the API level, which substring filters don't. + +For now, keep the cap and accept the tail loss for very busy channels; relax only when a specific channel justifies it. + +--- + +## Output of a fetch run + +`telegram_inbox.json` structure (overwritten each run): + +```jsonc +{ + "generated_at": "2026-06-02T...", + "lookback_days_for_new_channels": 30, + "total_in_inbox": <int>, + "channels": { + "<channel>": { + "lang": "ru" | "en" | null, // null = channel is still "new" / pending + "priority": "p1" | "p2" | "p3" | null, + "seen": <int>, // raw messages fetched + "kept": <int>, // after filter + "filtered_out": <int>, + "first_run": <bool>, // no prior state cursor + "truncated": <bool>, // hit MAX_PER_CHANNEL + "filter_mode": "filtered (...)" | "trust-all (no filter)" | "unfiltered (new channel — not yet curated)", + "messages": [ + { "id": <int>, "date": "<ISO 8601>", "text": "...", "has_media": <bool>, "link": "https://t.me/.../id" /* null for numeric-id channels */ } + ] + } + } +} +``` + +Messages are **chronological per channel** (oldest first within each channel). + +### Useful jq probes + +```bash +# Per-channel summary sorted by kept desc +jq -r '.channels | to_entries | sort_by(.value.kept) | reverse | .[] + | "\(.key) → kept \(.value.kept)/\(.value.seen) [\(.value.priority // "—")/\(.value.lang // "—")]"' \ + tracking/telegram_inbox.json + +# All p1 kept messages +jq '.channels | to_entries | map(select(.value.priority == "p1")) | from_entries' \ + tracking/telegram_inbox.json + +# Truncated channels with depth analysis +jq -r '.channels | to_entries | map(select(.value.truncated)) + | .[] | "\(.key): kept \(.value.kept)/\(.value.seen), priority \(.value.priority)"' \ + tracking/telegram_inbox.json +``` + +--- + +## After triage + +Promising postings → append a row to `applications.md`. Don't accumulate a "seen but skipped" log — the state cursor already prevents re-reading. 
+ +For outreach (cold DMs, recruiter conversations) → `outreach.md`, one row per touchpoint. + +If Oleg unsubscribes from a channel in Telegram, it disappears from the live folder list, the next run won't fetch it, and its entry in `telegram_channels.json` becomes dead weight. Periodic cleanup is fine but not required — dead entries cost ~150 bytes. diff --git a/tracking/telegram_channels.json b/tracking/telegram_channels.json new file mode 100644 index 0000000..a8b52d7 --- /dev/null +++ b/tracking/telegram_channels.json @@ -0,0 +1,144 @@ +{ + "jaabz_com": { + "lang": "en", + "priority": "p1", + "include": [["javascript", "typescript", "#typescript", "#javascript", " react", "#react", "node.js", "nodejs", "#nodejs", "next.js", "nextjs", "#nextjs", "fullstack", "full-stack", "ai engineer", " llm", "tech lead", "techlead", "staff engineer"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " java ", "#java ", "ruby", "#ruby", "php", "#php", "designer", "marketing", "sales", "recruit", "#dotnet", " c# ", "#rust", "qa engineer"] + }, + "dev_connectablejobs": { + "lang": "ru", + "priority": "p1", + "include": [["javascript", "typescript", " react", "node.js", "nodejs", "fullstack", "full-stack", "tech lead", "techlead", "ai engineer", "ml engineer", " llm", "mlops"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " java "] + }, + + "zarubezhom_jobs": { + "lang": "ru", + "priority": "p2", + "include": [["javascript", "typescript", " react", "node.js", "nodejs", "fullstack", "full-stack", "tech lead", "techlead", "ai engineer", "ml engineer", " llm"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " java ", "designer", "дизайнер"] + }, + "jsspeak": { + "lang": "ru", + "priority": "p2", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "huntermikevolkov": { + "lang": "ru", + "priority": "p2" + }, 
+ "budny_lucky_hunter": { + "lang": "ru", + "priority": "p2" + }, + "-1002137052673": { + "lang": "en", + "priority": "p2", + "include": [["typescript", "javascript", " react", "node.js", "nodejs", "fullstack", "full-stack", "ai engineer", "tech lead", "techlead"]], + "exclude": ["mental health", "marketing", "sales", "designer", "manager", "recruit", "kafka", "golang", "kotlin", "android", "swift"] + }, + "javascript_jobs_feed": { + "lang": "ru", + "priority": "p2" + }, + "Jobs_global_startups": { + "lang": "en", + "priority": "p2", + "include": [["typescript", "javascript", " react", "node.js", "nodejs", "fullstack", "full-stack", "tech lead", "techlead", "ai engineer", " llm"]], + "exclude": ["devops engineer", "qa engineer", "manual test", " junior", "kafka", "golang", "kotlin", "android", "swift", " java "] + }, + "javascript_jobs": { + "lang": "ru", + "priority": "p2", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "nodejs_jobs": { + "lang": "ru", + "priority": "p2", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "projects_jobs": { + "lang": "ru", + "priority": "p2", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "javascriptjobjs": { + "lang": "ru", + "priority": "p2", + "include": [["javascript", "typescript", "react", "node.js", "nodejs", "next.js", "nextjs", "fullstack", "full-stack", "frontend", "front-end"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " rust ", "stepik", "курс ", " junior", "trainee", "intern", "стажер", "#резюме"] + }, + "agile_jobs": { + "lang": "ru", + "priority": "p2", + "include": [["#vacancy", "#вакансия"], 
["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + + "devops_jobs": { + "lang": "ru", + "priority": "p3", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "mobile_jobs": { + "lang": "ru", + "priority": "p3", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "mobile_vacancies": { + "lang": "ru", + "priority": "p3", + "include": [["react native", "fullstack", "full-stack"]], + "exclude": ["kotlin", "android", "swift", " ios ", "ios developer", "flutter"] + }, + "devitjobs_nl": { + "lang": "en", + "priority": "p3", + "include": [["typescript", "javascript", " react", "node.js", "nodejs", "fullstack", "full-stack", "tech lead", "techlead"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " java ", " junior", "trainee", "intern", "ontwikkelaar"] + }, + "techjobsworld": { + "lang": "en", + "priority": "p3", + "include": [["remote", "удаленка"]], + "exclude": ["lagos", "nigeria", "abuja", "ikoyi", "onsite", "on-site", "on site", " sales ", "recruit", "manual test", "qa engineer", "kafka", "golang", "kotlin", "android", "swift", " java ", "ruby"] + }, + "gogetajob": { + "lang": "ru", + "priority": "p3", + "include": [["#vacancy", "#вакансия"], ["#remote", "#удаленка"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", "#резюме", "#resume", "#cv", "#ищуработу"] + }, + "remotejobss": { + "lang": "en", + "priority": "p3", + "include": [["typescript", "javascript", " react", "node.js", "nodejs", "fullstack", "full-stack", "ai engineer", "tech lead", "techlead"]], + "exclude": ["kafka", "golang", "kotlin", "android", "swift", " java ", "ruby", "php", "designer", "marketing", "sales", "customer service", 
"support agent", "recruit", "manual test", "qa engineer"] + }, + "bez_investorov": { + "lang": "ru", + "priority": "p3" + }, + "newworld_2088": { + "lang": "ru", + "priority": "p3" + }, + "Remote_Software_Developer_Jobs": { + "lang": "en", + "priority": "p3" + }, + "RemotiveJobs_All_Others": { + "lang": "en", + "priority": "p3" + }, + "remote_jobs_today": { + "lang": "en", + "priority": "p3" + } +}