Apache - mwicat/personal GitHub Wiki
mkdir logs
parallel rsync -avz '{}:/var/log/apache2/other_vhosts_access.log*' logs/{}/ < hosts
... or run parallel acquisition script
parallel-ssh -I -o results -H host1 hostname < script.sh
grep LogFormat -R /etc/apache2/
apt-get install apachetop
apachetop -f access.log.1576022400
<Location "/server-info">
SetHandler server-info
Require ip 127.0.0.1
Require ip 123.123.123.0/24
</Location>
Log formats (https://httpd.apache.org/docs/2.4/logs.html)
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" combined
LogFormat "%h %l %u %t \"%r\" %>s %b" common
LogFormat "%a %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %T %v" full
LogFormat "%{X-Forwarded-For}i %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" customlogger
Rotate the log every day (86400 seconds)
CustomLog "|/usr/bin/rotatelogs /var/tmp/apache.log 86400" combined
Rotate the log whenever it reaches 100 MB
CustomLog "|/usr/sbin/rotatelogs /var/tmp/apache.log 100M" "%h %l %u %t \"%r\" %>s %b"
fmt_common = "%h %l %u %t \"%r\" %s %b"
fmt_combined = "%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\""
https://pypi.org/project/httpagentparser/
sudo pip install pyyaml ua-parser user-agents
from user_agents import parse as ua_parse
ua = str(ua_parse(ua_str))
https://gist.github.com/mwicat/53638c63a31bae22f634ff7c60536a1e
import sys
import apache_log_parser
fmt = "%v %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
line_parser = apache_log_parser.make_parser(fmt)
rows = (line_parser(line) for line in sys.stdin)
# Reload the Apache configuration; the action is an argument to the init script, not part of its path.
/etc/init.d/apache2 reload
awk '($9 !~ /200/) { print $9, $7 }'
awk '{print $7}' < access.log
awk '{print $7}' < access.log | grep -o '^[^?]*' | sort | uniq -c
sudo pip install apache-log-parser
import sys
import apache_log_parser
import pandas as pd
# Combined log format; it has to match the LogFormat that produced the input.
fmt = '%h %l %u %t "%r" %s %b "%{Referer}i" "%{User-agent}i"'
line_parser = apache_log_parser.make_parser(fmt)
# Parse lazily, one log line from stdin at a time.
rows = (line_parser(line) for line in sys.stdin)
df = pd.DataFrame(rows)
# Index by request timestamp so rows can be grouped by time.
df.index = df.time_received_datetimeobj
# The parsed rows have no 'date' column; group on the datetime index instead.
df.groupby(df.index.year)
# NOTE: DataFrame.to_msgpack was removed in pandas 1.0; on newer pandas use
# e.g. df.to_pickle('storage.pkl') instead.
df.to_msgpack('storage.msg')
{
'remote_logname': '-',
'remote_user': '-',
'request_first_line': 'GET /test?x=7 HTTP/1.1',
'request_header_referer': '-',
'request_header_user_agent': 'Mozilla/5.0 Mobile Safari/537.36 '
'OPR/44.6.2246.127414',
'request_header_user_agent__browser__family': 'Opera Mobile',
'request_header_user_agent__browser__version_string': '44.6.2246',
'request_header_user_agent__is_mobile': True,
'request_header_user_agent__os__family': 'Other',
'request_header_user_agent__os__version_string': '',
'request_header_x_forwarded_for': '192.168.0.1',
'request_http_ver': '1.1',
'request_method': 'GET',
'request_url': '/test?x=7',
'request_url_fragment': '',
'request_url_hostname': None,
'request_url_netloc': '',
'request_url_password': None,
'request_url_path': '/test',
'request_url_port': None,
'request_url_query': 'x=7',
'request_url_query_dict': {'x': ['7']},
'request_url_query_list': [('x', '7')],
'request_url_query_simple_dict': {'x': '7'},
'request_url_scheme': '',
'request_url_username': None,
'response_bytes_clf': '328',
'status': '200',
'time_received': '[08/Feb/2019:11:59:59 +0000]',
'time_received_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59),
'time_received_isoformat': '2019-02-08T11:59:59',
'time_received_tz_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59, tzinfo='0000'),
'time_received_tz_isoformat': '2019-02-08T11:59:59+00:00',
'time_received_utc_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59, tzinfo='0000'),
'time_received_utc_isoformat': '2019-02-08T11:59:59+00:00',
'time_us': '301269'
}
'%a' # Remote IP-address
'%A' # Local IP-address
'%B' # Size of response in bytes, excluding HTTP headers.
'%b' # Size of response in bytes, excluding HTTP headers. In CLF format, i.e. a '-' rather than a 0 when no bytes are sent.
'%D' # The time taken to serve the request, in microseconds.
'%f' # Filename
'%h' # Remote host
'%H' # The request protocol
'%k' # Number of keepalive requests handled on this connection. Interesting if KeepAlive is being used, so that, for example, a '1' means the first keepalive request after the initial one, '2' the second, etc...; otherwise this is always 0 (indicating the initial request). Available in versions 2.2.11 and later.
'%l' # Remote logname (from identd, if supplied). This will return a dash unless mod_ident is present and IdentityCheck is set On.
'%m' # The request method
'%p' # The canonical port of the server serving the request
'%P' # The process ID of the child that serviced the request.
'%q' # The query string (prepended with a ? if a query string exists, otherwise an empty string)
'%r' # First line of request
'%R' # The handler generating the response (if any).
'%s' # Status. For requests that got internally redirected, this is the status of the *original* request --- %>s for the last.
'%t' # Time the request was received (standard english format)
'%T' # The time taken to serve the request, in seconds.
'%u' # Remote user (from auth; may be bogus if return status (%s) is 401)
'%U' # The URL path requested, not including any query string.
'%v' # The canonical ServerName of the server serving the request.
'%V' # The server name according to the UseCanonicalName setting.
'%X' # Connection status when response is completed:
# X = connection aborted before the response completed.
# + = connection may be kept alive after the response is sent.
# - = connection will be closed after the response is sent.
# (This directive was %c in late versions of Apache 1.3, but this conflicted with the historical ssl %{var}c syntax.)
'%I' # Bytes received, including request and headers, cannot be zero. You need to enable mod_logio to use this.
'%O' # Bytes sent, including headers, cannot be zero. You need to enable mod_logio to use this.
'%\{User-Agent\}i' # Special case of below, for matching just user agent
'%\{[^\}]+?\}i' # The contents of Foobar: header line(s) in the request sent to the server. Changes made by other modules (e.g. mod_headers) affect this. If you're interested in what the request header was prior to when most modules would have modified it, use mod_setenvif to copy the header into an internal environment variable and log that value with the %\{VARNAME}e described above.
'%\{[^\}]+?\}C' # The contents of cookie Foobar in the request sent to the server. Only version 0 cookies are fully supported.
'%\{[^\}]+?\}e' # The contents of the environment variable FOOBAR
'%\{[^\}]+?\}n' # The contents of note Foobar from another module.
'%\{[^\}]+?\}o' # The contents of Foobar: header line(s) in the reply.
'%\{[^\}]+?\}p' # The canonical port of the server serving the request or the server's actual port or the client's actual port. Valid formats are canonical, local, or remote.
'%\{[^\}]+?\}P' # The process ID or thread id of the child that serviced the request. Valid formats are pid, tid, and hextid. hextid requires APR 1.2.0 or higher.
'%\{[^\}]+?\}t' # The time, in the form given by format, which should be in strftime(3) format. (potentially localized)
'%\{[^\}]+?\}x' # Extension value, e.g. mod_ssl protocol and cipher
sudo pip install apache-log-parser
load_data.py:
import apache_log_parser
from pyspark.sql import Row
fn = '/var/tmp/apache.log'
fmt = '%{X-Forwarded-For}i %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %{Host}i'
line_parser = apache_log_parser.make_parser(fmt)
def parse(line):
    """Convert one raw access-log line into a Spark ``Row``.

    The line is run through the module-level ``line_parser`` and only the
    fields used for querying are kept: date, method, url, user_agent and
    status.
    """
    d = line_parser(line)
    r = {
        # Strip any tzinfo so the stored timestamp is a naive datetime.
        'date': d['time_received_datetimeobj'].replace(tzinfo=None),
        'method': d['request_method'],
        'url': d['request_url'],
        'user_agent': d['request_header_user_agent'],
        'status': d['status'],
    }
    return Row(**r)
rdd = sc.textFile(fn).map(parse)
df = rdd.toDF()
df.cache().count()
query_data.py:
# Make the DataFrame queryable from Spark SQL under the name 'logs'.
df.registerTempTable('logs')

# Visits per (year, month), in chronological order.
query = """
select year(date) as year, month(date) as month, count(*) as visits
from logs
group by year(date), month(date)
order by year(date), month(date)
"""

# Pull the aggregated result to the driver as a pandas DataFrame and print
# it without the index column.
visits_by_month = sqlContext.sql(query).toPandas()
print(visits_by_month.to_string(index=False))
save data:
df.toPandas().to_pickle("/var/tmp/parsed_logs.pkl")
$ pyspark
%run -i load_data.py
%run -i query_data.py
load pandas
import pandas as pd
df = pd.read_pickle("/var/tmp/parsed_logs.pkl")
RedirectMatch 500 "^/yourpath"
Cut everything under /yourpath except /yourpath/yourexcept
RedirectMatch 404 "^/yourpath/(?!yourexcept)"
# Capture the remainder of the path so that $1 in the target refers to a real group.
RewriteRule ^/yourpath/?(.*)$ https://www.yoursite.com/$1 [L,R=permanent]
Flags: L = last (stop processing further rules), R = redirect (e.g. R=302, R=301); see https://httpd.apache.org/docs/2.4/rewrite/flags.html