# SandDiary — robots.txt
# Updated: 2026-05-18
#
# Disallows mirror the sitemap-exclude list in astro.config.mjs and the
# noindex propagation in BaseLayout. Three independent gates so a single
# regression doesn't leak private flows into search results.

# Rules shared by every general-purpose crawler group below:
#   /auth/, /payment/  private flows (see header).
#   /*?*post=          legacy explore-detail share param (details below).
#   /u/                profile URL space that is not implemented yet.
#
# The rules are repeated in each group on purpose. A crawler obeys only
# the single group whose User-agent token matches it best and does NOT
# merge in rules from `User-agent: *`. A rule line always belongs to the
# nearest preceding User-agent line; comments and blank lines do not
# start a new group. Keep the three groups in sync when editing, and do
# not put blank lines inside a group (some legacy parsers treat a blank
# line as the end of the group).

User-agent: *
Allow: /
Disallow: /auth/
Disallow: /payment/
# Block query-param noise that can produce duplicate-content reports.
# `*post=` is the legacy explore detail share param. The canonical
# location for an individual post is now `/p/{id}` (static page);
# anything reached via `?post=` is the listing-with-overlay variant,
# which duplicates the same content. /p/{id} carries the authoritative
# canonical for each post.
Disallow: /*?*post=
# /u/{handle} profile pages are not yet implemented; block crawlers
# from speculatively indexing the URL space until they ship.
Disallow: /u/

# Naver (Yeti) — Korea's dominant search engine. Naver Search Advisor
# explicitly checks for an entry naming Yeti even when User-agent: *
# Allow: / is in place. Explicit allowlist + sitemap declaration here
# resolves the "robots.txt 가 존재하지 않습니다" ("robots.txt does not
# exist") advisor warning.
User-agent: Yeti
Allow: /
Disallow: /auth/
Disallow: /payment/
Disallow: /*?*post=
Disallow: /u/

# Daum/Kakao crawler — same policy as Naver.
User-agent: Daumoa
Allow: /
Disallow: /auth/
Disallow: /payment/
Disallow: /*?*post=
Disallow: /u/

# Known abusive / SEO-spammer scrapers. Block hard rather than just
# slow them with crawl-delay — the bots that respect robots will obey,
# the ones that don't were never going to anyway.
#
# One group per crawler token, each disallowing the whole site. A bot
# that matches its own group here ignores the `User-agent: *` group
# entirely, so none of the Allow lines above apply to it.
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

# ───────────────────────────────────────────────────────────────────
# AI crawler policy — "Instagram-style"
# ───────────────────────────────────────────────────────────────────
# Two distinct classes of AI bots, treated differently:
#
#   TRAINING bots (Disallow) — scrape pages to train future models.
#     Once content lands in a frozen model, we can't get it back. So
#     we block these by default. Decision: 2026-05-19, user-confirmed.
#
#   ANSWER bots (Allow via User-agent: *) — fetch pages live when a
#     user asks the LLM a real-time question. The model NEVER stores
#     the page; the answer just cites SandDiary as a source. This is
#     functionally "AI-SERP" — same value-equation as Googlebot, but
#     with smaller upside per impression. We want SandDiary cited.
#
# Auditable list (sourced from official docs of each provider):
#   TRAINING (BLOCKED):
#     GPTBot           — OpenAI model training
#     anthropic-ai     — Anthropic legacy training crawler
#     ClaudeBot        — Anthropic current training crawler
#     CCBot            — Common Crawl (corpus feeds many LLMs)
#     Google-Extended  — Gemini training opt-out signal
#     Amazonbot        — Amazon LLM training
#     Bytespider       — ByteDance/Doubao training
#     Diffbot          — KG/training scraper
#     omgili / FacebookBot training mode
#
#   ANSWER (ALLOWED via User-agent: * default):
#     ChatGPT-User     — OpenAI real-time browsing (citations)
#     OAI-SearchBot    — ChatGPT search results
#     PerplexityBot    — Perplexity live answers (citations)
#     Perplexity-User  — Perplexity follow-up fetches
#     Claude-Web       — Anthropic real-time web use
#     (Googlebot is likewise unaffected: Google-Extended only opts out
#      of Gemini training, so SandDiary still appears in Google SERP.)
#
# This split matches the policy major creator-economy sites
# (Instagram, NYT, Reddit-API-partners) have settled on.

# TRAINING crawlers — blocked site-wide. One group per product token;
# each group must name the token exactly as the provider documents it.
User-agent: GPTBot
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: CCBot
Disallow: /

# Google-Extended is a control token, not a separate crawler: it opts
# the site out of Gemini training without affecting Googlebot.
User-agent: Google-Extended
Disallow: /

User-agent: Amazonbot
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: Diffbot
Disallow: /

# omgili (Webz.io) and FacebookBot are listed as BLOCKED in the policy
# list. Without a group of their own they would fall through to
# `User-agent: *` and be allowed. Webz.io crawls under both the
# `omgili` and `omgilibot` tokens, so both are named.
User-agent: omgili
Disallow: /

User-agent: omgilibot
Disallow: /

# FacebookBot is Meta's model-training crawler. Link previews use the
# separate `facebookexternalhit` token, which is deliberately NOT
# blocked here.
User-agent: FacebookBot
Disallow: /

# (ChatGPT-User / OAI-SearchBot / PerplexityBot / Perplexity-User /
#  Claude-Web are NOT listed here — they inherit "Allow: /" from the
#  User-agent: * stanza at the top of the file.)

# ───────────────────────────────────────────────────────────────────
# Sitemap declarations — single canonical entry point.
# ───────────────────────────────────────────────────────────────────
# Why ONE Sitemap directive instead of three:
#   1. /sitemap.xml is the master index, referencing every child
#      with its own lastmod. Crawlers discover the whole tree from
#      this one URL.
#   2. Multiple Sitemap directives are valid (per the sitemaps.org
#      protocol; RFC 9309 only permits Sitemap as an optional extra
#      record), but crawlers handle a single index more reliably —
#      fewer race conditions when one child sitemap rotates faster
#      than another.
#   3. /sitemap-index.xml and /sitemap-posts.xml stay accessible
#      for direct submission to Search Console (some publishers
#      prefer submitting children directly), they just aren't the
#      crawler-discovery entry point.
#
# The Sitemap record is independent of the User-agent groups, so it
# applies to every crawler that reads it. The URL must be absolute.
Sitemap: https://sanddiary.com/sitemap.xml