Add user-agent blocking for dealing with webscrapers

Change-Id: I9c26791a9f96bb2e87f0f427f7ff4adcb46a2b9e
diff --git a/README.md b/README.md
index 707378c..750cd43 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,9 @@
 # NOTE: this will cause failures if you haven't previously run the the acme
 # role, so override it when that role isn't being used.
 acme_username: "acme"
+
+# case-insensitive regex of User-Agent patterns (webscrapers) to reject with HTTP 403
+blocked_user_agents: "DotBot|MJ12bot|SemrushBot|PetalBot|AhrefsBot"
 ```
 
 The `vhosts` is used to define virtualhosts on the nginx server. This is a list
diff --git a/defaults/main.yml b/defaults/main.yml
index 70e2731..14ee763 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -39,3 +39,6 @@
 
 # authentication scope dict to create htpasswd files - see README.md
 auth_scopes: {}
+
+# case-insensitive regex of User-Agent patterns (webscrapers) to reject with HTTP 403
+blocked_user_agents: "DotBot|MJ12bot|SemrushBot|PetalBot|AhrefsBot"
diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml
index 5e2e15c..fbe12ca 100644
--- a/molecule/default/verify.yml
+++ b/molecule/default/verify.yml
@@ -28,6 +28,14 @@
       register: webpage
       failed_when: "'This file is served from static.example.com' not in webpage.content"
 
+    - name: Test that bad user agents are blocked with 403
+      uri:
+        url: http://127.0.0.1/
+        headers:
+          Host: "static.example.com"
+          User-Agent: "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"
+        status_code: 403  # AhrefsBot is in the default blocked_user_agents list
+
     - name: Test that the static site is the default site
       uri:
         url: http://127.0.0.1/
diff --git a/templates/vhost.conf.j2 b/templates/vhost.conf.j2
index 94ea8da..142e74a 100644
--- a/templates/vhost.conf.j2
+++ b/templates/vhost.conf.j2
@@ -79,6 +79,11 @@
   access_log {{ nginx_log_dir }}/{{ item.name }}_access.log;
   error_log  {{ nginx_log_dir }}/{{ item.name }}_error.log;
 
+{% if blocked_user_agents %}
+  # user agent (webscraper) blocks; quoted so the regex may contain spaces, ";" or "}"
+  if ($http_user_agent ~* "{{ blocked_user_agents }}") { return 403; }
+
+{% endif %}
 {% if item.extra_config is defined and item.extra_config %}
   # extra config
   {{ item.extra_config | indent(2) }}