Apache - mwicat/personal GitHub Wiki

Gather logs

mkdir logs
parallel rsync -avz '{}:/var/log/apache2/other_vhosts_access.log*' logs/{}/ < hosts

... or run parallel acquisition script

parallel-ssh -I -o results -H host1 hostname < script.sh

Get configured log format

grep LogFormat -R /etc/apache2/

Monitor logs live

apt-get install apachetop
apachetop -f access.log.1576022400

Access control

<Location "/server-info">
    SetHandler server-info
    Require ip 127.0.0.1
    Require ip 123.123.123.0/24
</Location>
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" combined
LogFormat "%h %l %u %t \"%r\" %>s %b" common
LogFormat "%a %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %T %v" full
LogFormat "%{X-Forwarded-For}i %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" customlogger

Custom logs

Truncate every 1 day

CustomLog "|/usr/bin/rotatelogs /var/tmp/apache.log 86400" combined

Truncate every 100M

CustomLog "|/usr/sbin/rotatelogs /var/tmp/apache.log 100M" "%h %l %u %t \"%r\" %>s %b"

Log formats

fmt_common = "%h %l %u %t \"%r\" %>s %b"
fmt_combined = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""

Parse user agent

https://pypi.org/project/httpagentparser/

sudo pip install pyyaml ua-parser user-agents
from user_agents import parse as ua_parse
ua = str(ua_parse(ua_str))

Parse user agent top 10 visiting browsers

https://gist.github.com/mwicat/53638c63a31bae22f634ff7c60536a1e

Parse logs

import sys

import apache_log_parser

# Format string must mirror the server's LogFormat directive for these logs
# (here: combined format prefixed with %v, the canonical ServerName).
fmt = "%v %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
line_parser = apache_log_parser.make_parser(fmt)

# Lazy generator: one dict of parsed fields per log line read from stdin.
rows = (line_parser(line) for line in sys.stdin)

Reload

/etc/init.d/apache2 reload

Grep for errors

awk '($9 !~ /200/) { print $9, $7 }'

Grep accessed resources

awk '{print $7}' < access.log

Stats for accessed resources

awk '{print $7}' < access.log | grep -o '^[^?]*' | sort | uniq -c

Log to DataFrame

sudo pip install apache-log-parser
import sys

import apache_log_parser
import pandas as pd

# Must match the "combined" LogFormat that wrote the logs (see the LogFormat
# directives above): single spaces and %>s, the final status after redirects.
fmt = '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"'
line_parser = apache_log_parser.make_parser(fmt)

# Lazy generator: one dict of parsed fields per log line read from stdin.
rows = (line_parser(line) for line in sys.stdin)
df = pd.DataFrame(rows)
df.index = df.time_received_datetimeobj
# The parser emits no 'date' column (see the sample record below) --
# group on the datetime index that was just assigned.
df.groupby(df.index.year)
# DataFrame.to_msgpack was removed in pandas 1.0; persist with pickle instead.
df.to_pickle('storage.pkl')
{
 'remote_logname': '-',
 'remote_user': '-',
 'request_first_line': 'GET /test?x=7 HTTP/1.1',
 'request_header_referer': '-',
 'request_header_user_agent': 'Mozilla/5.0 Mobile Safari/537.36 '
                              'OPR/44.6.2246.127414',
 'request_header_user_agent__browser__family': 'Opera Mobile',
 'request_header_user_agent__browser__version_string': '44.6.2246',
 'request_header_user_agent__is_mobile': True,
 'request_header_user_agent__os__family': 'Other',
 'request_header_user_agent__os__version_string': '',
 'request_header_x_forwarded_for': '192.168.0.1',
 'request_http_ver': '1.1',
 'request_method': 'GET',
 'request_url': '/test?x=7',
 'request_url_fragment': '',
 'request_url_hostname': None,
 'request_url_netloc': '',
 'request_url_password': None,
 'request_url_path': '/test',
 'request_url_port': None,
 'request_url_query': 'x=7',
 'request_url_query_dict': {'x': ['7']},
 'request_url_query_list': [('x', '7')],
 'request_url_query_simple_dict': {'x': '7'},
 'request_url_scheme': '',
 'request_url_username': None,
 'response_bytes_clf': '328',
 'status': '200',
 'time_received': '[08/Feb/2019:11:59:59 +0000]',
 'time_received_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59),
 'time_received_isoformat': '2019-02-08T11:59:59',
 'time_received_tz_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59, tzinfo='0000'),
 'time_received_tz_isoformat': '2019-02-08T11:59:59+00:00',
 'time_received_utc_datetimeobj': datetime.datetime(2019, 2, 8, 11, 59, 59, tzinfo='0000'),
 'time_received_utc_isoformat': '2019-02-08T11:59:59+00:00',
 'time_us': '301269'
}
    '%a'  #	Remote IP-address
    '%A'  #	Local IP-address
    '%B'  #	Size of response in bytes, excluding HTTP headers.
    '%b'  #	Size of response in bytes, excluding HTTP headers. In CLF format, i.e. a '-' rather than a 0 when no bytes are sent.
    '%D'  #	The time taken to serve the request, in microseconds.
    '%f'  #	Filename
    '%h'  #	Remote host
    '%H'  #	The request protocol
    '%k'  #	Number of keepalive requests handled on this connection. Interesting if KeepAlive is being used, so that, for example, a '1' means the first keepalive request after the initial one, '2' the second, etc...; otherwise this is always 0 (indicating the initial request). Available in versions 2.2.11 and later.
    '%l'  #	Remote logname (from identd, if supplied). This will return a dash unless mod_ident is present and IdentityCheck is set On.
    '%m'  #	The request method
    '%p'  #	The canonical port of the server serving the request
    '%P'  #	The process ID of the child that serviced the request.
    '%q'  #	The query string (prepended with a ? if a query string exists, otherwise an empty string)
    '%r'  #	First line of request
    '%R'  #	The handler generating the response (if any).
    '%s'  #	Status. For requests that got internally redirected, this is the status of the *original* request --- %>s for the last.
    '%t'  #	Time the request was received (standard english format)
    '%T'  #	The time taken to serve the request, in seconds.
    '%u'  #	Remote user (from auth; may be bogus if return status (%s) is 401)
    '%U'  #	The URL path requested, not including any query string.
    '%v'  #	The canonical ServerName of the server serving the request.
    '%V'  #	The server name according to the UseCanonicalName setting.
    '%X'  #	Connection status when response is completed:
              # X =	connection aborted before the response completed.
              # + =	connection may be kept alive after the response is sent.
              # - =	connection will be closed after the response is sent.
              # (This directive was %c in late versions of Apache 1.3, but this conflicted with the historical ssl %{var}c syntax.)
    '%I'  #	Bytes received, including request and headers, cannot be zero. You need to enable mod_logio to use this.
    '%O'  #	Bytes sent, including headers, cannot be zero. You need to enable mod_logio to use this.
    
    '%\{User-Agent\}i'  # Special case of below, for matching just user agent
    '%\{[^\}]+?\}i'  #	The contents of Foobar: header line(s) in the request sent to the server. Changes made by other modules (e.g. mod_headers) affect this. If you're interested in what the request header was prior to when most modules would have modified it, use mod_setenvif to copy the header into an internal environment variable and log that value with the %\{VARNAME}e described above.
    
    '%\{[^\}]+?\}C'  #	The contents of cookie Foobar in the request sent to the server. Only version 0 cookies are fully supported.
    '%\{[^\}]+?\}e'  #	The contents of the environment variable FOOBAR
    '%\{[^\}]+?\}n'  #	The contents of note Foobar from another module.
    '%\{[^\}]+?\}o'  #	The contents of Foobar: header line(s) in the reply.
    '%\{[^\}]+?\}p'  #	The canonical port of the server serving the request or the server's actual port or the client's actual port. Valid formats are canonical, local, or remote.
    '%\{[^\}]+?\}P'  #	The process ID or thread id of the child that serviced the request. Valid formats are pid, tid, and hextid. hextid requires APR 1.2.0 or higher.
    '%\{[^\}]+?\}t'  #	The time, in the form given by format, which should be in strftime(3) format. (potentially localized)
    '%\{[^\}]+?\}x'  # Extension value, e.g. mod_ssl protocol and cipher

Parse with pyspark

sudo pip install apache-log-parser

load_data.py:

import apache_log_parser
from pyspark.sql import Row


# Raw access log to be loaded into Spark.
fn = '/var/tmp/apache.log'

# Must match the "customlogger" LogFormat defined above
# (X-Forwarded-For in place of %h, plus a trailing %{Host}i header).
fmt = '%{X-Forwarded-For}i %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %{Host}i'

line_parser = apache_log_parser.make_parser(fmt)


def parse(line):
    """Turn one raw access-log line into a flat Row of the fields we query."""
    fields = line_parser(line)
    return Row(
        # Drop the tzinfo so Spark sees a naive timestamp.
        date=fields['time_received_datetimeobj'].replace(tzinfo=None),
        method=fields['request_method'],
        url=fields['request_url'],
        user_agent=fields['request_header_user_agent'],
        status=fields['status'],
    )


# NOTE(review): assumes `sc` (SparkContext) is in scope, e.g. inside a pyspark shell.
rdd = sc.textFile(fn).map(parse)
df = rdd.toDF()

# Materialize and cache the parsed DataFrame once before running queries on it.
df.cache().count()

query_data.py:

# Monthly visit counts over the parsed logs, ordered chronologically.
query = """
select year(date) as year, month(date) as month, count(*) as visits
from logs
group by year(date), month(date)
order by year(date), month(date)
"""

# registerTempTable is deprecated since Spark 2.0 (createOrReplaceTempView
# is the replacement) but still works on older clusters.
df.registerTempTable('logs')

# NOTE(review): assumes `sqlContext` is in scope, e.g. inside a pyspark shell.
visits_by_month = sqlContext.sql(query).toPandas()
print(visits_by_month.to_string(index=False))

save data:

df.toPandas().to_pickle("/var/tmp/parsed_logs.pkl")
$ pyspark
% run -i load_data.py
% run -i query_data.py

load pandas

import pandas as pd
df = pd.read_pickle("/var/tmp/parsed_logs.pkl")

Cut requests

RedirectMatch 500 "^/yourpath"

Cut except

RedirectMatch 404 "^/yourpath/(?!yourexcept)"

Redirect

RewriteRule ^/yourpath(.*)     https://www.yoursite.com/$1 [L,R=permanent]

L = last R = redirect (e.g. R=302, R=301) https://httpd.apache.org/docs/2.4/rewrite/flags.html

⚠️ **GitHub.com Fallback** ⚠️