Add user-agent blocking for dealing with webscrapers
Change-Id: I9c26791a9f96bb2e87f0f427f7ff4adcb46a2b9e
diff --git a/README.md b/README.md
index 707378c..750cd43 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,9 @@
# NOTE: this will cause failures if you haven't previously run the the acme
# role, so override it when that role isn't being used.
acme_username: "acme"
+
+# block specific user agents (webscrapers)
+blocked_user_agents: "DotBot|MJ12bot|SemrushBot|PetalBot|AhrefsBot"
```
The `vhosts` is used to define virtualhosts on the nginx server. This is a list
diff --git a/defaults/main.yml b/defaults/main.yml
index 70e2731..14ee763 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -39,3 +39,6 @@
# authentication scope dict to create htpasswd files - see README.md
auth_scopes: {}
+
+# block specific user agents (webscrapers)
+blocked_user_agents: "DotBot|MJ12bot|SemrushBot|PetalBot|AhrefsBot"
diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml
index 5e2e15c..fbe12ca 100644
--- a/molecule/default/verify.yml
+++ b/molecule/default/verify.yml
@@ -28,6 +28,14 @@
register: webpage
failed_when: "'This file is served from static.example.com' not in webpage.content"
+ - name: Test that bad user agents are blocked with 403
+ uri:
+ url: http://127.0.0.1/
+ headers:
+ Host: "static.example.com"
+ User-Agent: "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"
+ status_code: 403
+
- name: Test that the static site is the default site
uri:
url: http://127.0.0.1/
diff --git a/templates/vhost.conf.j2 b/templates/vhost.conf.j2
index 94ea8da..142e74a 100644
--- a/templates/vhost.conf.j2
+++ b/templates/vhost.conf.j2
@@ -79,6 +79,11 @@
access_log {{ nginx_log_dir }}/{{ item.name }}_access.log;
error_log {{ nginx_log_dir }}/{{ item.name }}_error.log;
+ # user agent (webscraper) blocks
+ if ($http_user_agent ~* "{{ blocked_user_agents }}") {
+ return 403;
+ }
+
{% if item.extra_config is defined and item.extra_config %}
# extra config
{{ item.extra_config | indent(2) }}