chore(default-config): allowlist common crawl (#753)
This may seem strange, but allowlisting common crawl means that scrapers have less incentive to scrape because they can just grab the data from common crawl instead of scraping it again.
This commit is contained in:
parent
d7a758f805
commit
7c0996448a
5 changed files with 19 additions and 3 deletions
|
|
@@ -6,4 +6,5 @@
|
|||
- import: (data)/crawlers/internet-archive.yaml
|
||||
- import: (data)/crawlers/kagibot.yaml
|
||||
- import: (data)/crawlers/marginalia.yaml
|
||||
- import: (data)/crawlers/mojeekbot.yaml
|
||||
- import: (data)/crawlers/commoncrawl.yaml
|
||||
|
|
|
|||
12
data/crawlers/commoncrawl.yaml
Normal file
|
|
@@ -0,0 +1,12 @@
|
|||
- name: common-crawl
|
||||
user_agent_regex: CCBot
|
||||
action: ALLOW
|
||||
# https://index.commoncrawl.org/ccbot.json
|
||||
remote_addresses:
|
||||
[
|
||||
"2600:1f28:365:80b0::/60",
|
||||
"18.97.9.168/29",
|
||||
"18.97.14.80/29",
|
||||
"18.97.14.88/30",
|
||||
"98.85.178.216/32",
|
||||
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue