chore(default-config): allowlist common crawl (#753)
This may seem strange, but allowlisting common crawl means that scrapers have less incentive to scrape because they can just grab the data from common crawl instead of scraping it again.
This commit is contained in:
parent
d7a758f805
commit
7c0996448a
5 changed files with 19 additions and 3 deletions
|
|
@@ -6,4 +6,5 @@
|
|||
- import: (data)/crawlers/internet-archive.yaml
|
||||
- import: (data)/crawlers/kagibot.yaml
|
||||
- import: (data)/crawlers/marginalia.yaml
|
||||
- import: (data)/crawlers/mojeekbot.yaml
|
||||
- import: (data)/crawlers/commoncrawl.yaml
|
||||
|
|
|
|||
12
data/crawlers/commoncrawl.yaml
Normal file
|
|
@@ -0,0 +1,12 @@
|
|||
- name: common-crawl
|
||||
user_agent_regex: CCBot
|
||||
action: ALLOW
|
||||
# https://index.commoncrawl.org/ccbot.json
|
||||
remote_addresses:
|
||||
[
|
||||
"2600:1f28:365:80b0::/60",
|
||||
"18.97.9.168/29",
|
||||
"18.97.14.80/29",
|
||||
"18.97.14.88/30",
|
||||
"98.85.178.216/32",
|
||||
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue