#!/usr/bin/perl -w
#
# logscan - search Apache access logs by field using a regular expression
# Apache Security, http://www.apachesecurity.net
# Copyright (C) 2004 Ivan Ristic <ivanr@webkreator.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

# This script makes searching through access logs easier. It accepts
# three parameters on input: the name of the file to search, variable
# name, and a regular expression. It will open the log file, parse
# out each line into individual variables and print those lines where
# the variable matches the regular expression.
#
# Example (lists requests with status that begins with 5):
#
#   logscan access_log status ^5
#
# Assuming the following access log line:
#
# 82.70.94.182 - - [09/Dec/2004:17:41:44 +0000] "GET /upload.php?p=1 HTTP/1.1" 500 636
#
# the script will produce the following variables:
#
#   remote_ip            82.70.94.182
#   remote_username      -
#   username             -
#   date                 09/Dec/2004
#   time                 17:41:44
#   gmt_offset           +0000
#   request_method       GET
#   request_uri          /upload.php?p=1
#   script_name          /upload.php
#   query_string         p=1
#   protocol             HTTP/1.1
#   status               500
#   bytes_out            636
#   the_rest             (the invisible newline character at the end)
#
#   time_mday            09
#   time_mon             11 (zero-based month number, i.e. December)
#   time_year            2004
#   time_hour            17
#   time_min             41
#   time_sec             44
#   time_epoch           1102614104
#
# If your logs are in the combined log format, then uncomment one line
# below (look for the comment "UNCOMMENT THE LINE...") to enable the
# script to recognise two more variables:
#
#   referer              http://www.apachesecurity.net/
#   user_agent           Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0
#
# Variables request_uri and query_string are normalized before
# the regular expression is applied. At this time the following
# is true:
#
#  * Comparison is not case sensitive
#  * URL-encoded characters are decoded

# TODO Add more normalization options
#
# TODO Parse individual parameters
# 
# TODO Option to read a list of regular expression from a file

use Time::Local;

# Lookup table from the three-letter month abbreviation found in access
# log timestamps to its zero-based month number (Jan => 0 ... Dec => 11).
# The position of each name in the list below is its month number.
my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

%months = map { $month_names[$_] => $_ } 0 .. $#month_names;

# -- log parsing regular expression
#
# The pattern is built up as a string, one group of fields at a time, and
# is applied by parse_logline() with the /x modifier. Because /x ignores
# unescaped whitespace in the pattern, every literal space is written as
# an escaped space ("\\ " in the Perl string).

# remote host, remote username, local username
$fmt = "^(\\S+)\\ (\\S+)\\ (\\S+)";
# date, time, and gmt offset
$fmt .= "\\ \\[([^:]+):(\\d+:\\d+:\\d+)\\ ([^\\]]+)\\]";
# request method, request uri
$fmt .= "\\ \"(\\S+)\\ (.*?)\\ ";
# protocol, status, bytes out
# The closing double quote of the request line is matched explicitly and
# kept outside the protocol group, so the protocol is captured as
# HTTP/1.1 rather than HTTP/1.1" (with the quote attached).
$fmt .= "([^\\s\"]+)\"\\ (\\d+)\\ (\\S+)";

# referer, user_agent
# UNCOMMENT THE LINE BELOW FOR "combined" LOG FORMAT
# $fmt .= "\\ \"(.*)\"\\ \"(.*)\"";

# the rest (always keep this part of the regex)
$fmt .= "(.*)\$";

# Parse one access log line into a set of named fields.
#
# Parameters:
#   $line - the log line to parse. When it is omitted (or undefined) the
#           current value of $_ is parsed instead.
#
# Returns a flat key/value list suitable for assigning to a hash, with
# the keys:
#   remote_ip, remote_username, username, date, time, gmt_offset,
#   request_method, request_uri, protocol, status, bytes_out, the_rest,
#   time_mday, time_mon (zero-based), time_year, time_hour, time_min,
#   time_sec, time_epoch, script_name, query_string
# plus referer and user_agent, but only when $fmt captures them (that is,
# when the "combined" log format line of $fmt is enabled).
#
# request_uri and query_string have their %XX escapes decoded;
# script_name is left exactly as it appeared in the log.
#
# Dies with "Invalid line: ..." when the line does not match $fmt or its
# date does not look like dd/Mon/yyyy with a month name known to %months.
#
# Uses the file-level $fmt pattern and %months table.
sub parse_logline {
    my ($line) = @_;
    $line = $_ unless defined $line;

    my @parsed = ($line =~ /$fmt/x);
    if (@parsed == 0) {
        die "Invalid line: " . $line;
    }

    my %request;

    # The last group of $fmt is always "the rest". Take it off first so
    # that it cannot end up in the referer slot when the pattern has no
    # referer/user_agent groups.
    $request{"the_rest"} = pop(@parsed);

    @request{
        "remote_ip", "remote_username", "username",
        "date", "time", "gmt_offset",
        "request_method", "request_uri",
        "protocol", "status", "bytes_out"
    } = splice(@parsed, 0, 11);

    # Whatever is left over are the two optional combined-format fields.
    if (@parsed == 2) {
        @request{"referer", "user_agent"} = @parsed;
    }

    # Apache logs "-" when no body bytes were sent.
    if ($request{"bytes_out"} eq "-") {
        $request{"bytes_out"} = 0;
    }

    my ($mday, $mon_name, $year) =
        ( $request{"date"} =~ m/^(\d+)\/(\S+)\/(\d+)/x );
    if (!defined($mon_name) || !exists($months{$mon_name})) {
        die "Invalid line: " . $line;
    }

    my ($hour, $min, $sec) = ( $request{"time"} =~ m/(\d+):(\d+):(\d+)/x );

    $request{"time_mday"} = $mday;
    $request{"time_mon"}  = $months{$mon_name};
    $request{"time_year"} = $year;
    $request{"time_hour"} = $hour;
    $request{"time_min"}  = $min;
    $request{"time_sec"}  = $sec;

    # The logged timestamp is wall-clock time at the logged GMT offset, so
    # convert it as UTC and then remove the offset. This gives the same
    # epoch no matter which time zone the script itself runs in. Only if
    # the offset is not of the form +HHMM / -HHMM do we fall back to
    # interpreting the timestamp in the local time zone.
    if ($request{"gmt_offset"} =~ m/^([+-])(\d\d)(\d\d)$/) {
        my ($sign, $off_hours, $off_mins) = ($1, $2, $3);
        my $offset = ($off_hours * 60 + $off_mins) * 60;
        $offset = -$offset if ($sign eq "-");
        $request{"time_epoch"} = timegm(
            $sec, $min, $hour, $mday, $request{"time_mon"}, $year
        ) - $offset;
    } else {
        $request{"time_epoch"} = timelocal(
            $sec, $min, $hour, $mday, $request{"time_mon"}, $year
        );
    }

    # Split on the first "?" before decoding, so that an encoded question
    # mark (%3F) in the path is not mistaken for the query separator.
    my $offset = index($request{"request_uri"}, "?");
    if ($offset != -1) {
        $request{"script_name"} = substr($request{"request_uri"}, 0, $offset);
        $request{"query_string"} = substr($request{"request_uri"}, $offset + 1);
    } else {
        $request{"script_name"} = $request{"request_uri"};
        $request{"query_string"} = "";
    }

    # Decode URL-encoded (%XX) characters.
    $request{"request_uri"} =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg;
    $request{"query_string"} =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg;

    return %request;
}

# -- main ---------------------------------------
#
# Usage: logscan <logfile> <fieldname> <regex>
#
# Prints every line of <logfile> whose <fieldname> (one of the keys
# returned by parse_logline) matches <regex>, ignoring case.

if (@ARGV != 3) {
    # A usage error is reported on STDERR with a non-zero exit status so
    # that it is never mistaken for search output.
    print STDERR "Usage: logscan <logfile> <fieldname> <regex>\n";
    exit 1;
}

my($filename, $field_name, $regex) = @ARGV;

# Compile the pattern once, up front: a malformed expression is reported
# before any of the log is read, and it is not recompiled for every line.
my $matcher = eval { qr/$regex/i };
if (!defined($matcher)) {
    die "Invalid regular expression '$regex': $@";
}

# Three-argument open: the file name is never interpreted as an open mode.
open(my $log, "<", $filename) || die "Failed to open $filename: $!";
while (<$log>) {
    # ignore empty lines
    next if ($_ =~ m/^$/);

    my %request = parse_logline($_);

    # A misspelled field name would otherwise match nothing, silently.
    if (!exists($request{$field_name})) {
        die "Unknown field name: $field_name\n";
    }

    # A field can be present but have no value for this line.
    my $value = $request{$field_name};
    next if (!defined($value));

    if ($value =~ $matcher) {
        print $_;
    }
}
close($log);
