#!/usr/bin/perl -w
#
# logscan - Apache log analysis utility
# Apache Security, http://www.apachesecurity.net
# Copyright (C) 2004,2005 Ivan Ristic <ivanr@webkreator.com>
#
# $Id: logscan,v 1.4 2005/03/29 16:43:26 ivanr Exp $
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

# This script makes searching through access logs easier. It accepts
# three parameters on input: the name of the file to search, variable
# name, and a regular expression. It will open the log file, parse
# out each line into individual variables and print those lines where
# the variable matches the regular expression.
#
# Example (lists requests with status that begins with 5):
#
#   logscan access_log status ^5
#
# Assuming the following access log line:
#
# 82.70.94.182 - - [09/Dec/2004:17:41:44 +0000] "GET /upload.php?p=1 HTTP/1.1" 500 636
#
# the script will produce the following variables:
#
#   remote_ip            82.70.94.182
#   remote_username      -
#   username             -
#   date                 09/Dec/2004
#   time                 17:41:44
#   gmt_offset           +0000
#   request_method       GET
#   request_uri          /upload.php?p=1
#   script_name          /upload.php
#   query_string         p=1
#   protocol             HTTP/1.1
#   status               500
#   bytes_out            636
#   the_rest             (anything left on the line after the last field; normally empty)
#
#   time_mday            09
#   time_mon             11 (zero-based: 0 is January, 11 is December)
#   time_year            2004
#   time_hour            17
#   time_min             41
#   time_sec             44
#   time_epoch           1102614104
#
# If your logs are in the combined log format, then uncomment one line
# below (look for the comment "UNCOMMENT THE LINE...") to enable the
# script to recognise two more variables:
#
#   referer              http://www.apachesecurity.net/
#   user_agent           Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0
#
# Variables request_uri and query_string are normalized before
# the regular expression is applied. At this time the following
# is true:
#
#  * Comparison is not case sensitive
#  * URL-encoded characters are decoded

# TODO Add more normalization options
#
# TODO Parse individual parameters
# 
# TODO Option to read a list of regular expression from a file
#
# TODO Command line switch to print known variable names (i.e. print help)
#
# TODO Test mode, to just parse the first entry and print all variables
#
# TODO Support for multiple (named) log formats, support for custom
#      log formats
#
# TODO Add regex negation
#
# TODO Option to read input from stdin

use Time::Local;

# Three-letter English month abbreviations, as they appear in the log
# timestamp, mapped to zero-based month numbers (Jan => 0 ... Dec => 11).
# parse_logline uses this table to turn the month name into the numeric
# month argument of its Time::Local call.
our %months;
@months{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (0 .. 11);

# -- log parsing regular expressions
#
# Both patterns are kept as plain strings and are applied by
# parse_logline with the /x modifier, which is why every literal
# space is written as "\ ".

our $logline_regex = join "", (
    # remote host, remote username, local username
    '^(\S+)\ (\S+)\ (\S+)',

    # date, time, and gmt offset
    '\ \[([^:]+):(\d+:\d+:\d+)\ ([^\]]+)\]',

    # request method + request uri + protocol (as one field)
    '\ "(.*)"',

    # status, bytes out
    '\ (\d+)\ (\S+)',

    # referer, user_agent
    # UNCOMMENT THE LINE BELOW FOR "combined" LOG FORMAT
    # '\ "(.*)"\ "(.*)"',

    # the rest (always keep this part of the regex)
    '(.*)$',
);

# Splits the quoted request field into method, uri and protocol.
our $therequest_regex = '(\S+)\ (.*?)\ (\S+)';

# Parse a single access log line into named fields.
#
# Parameters:
#   the first argument is one raw line read from the log file.
#
# Returns:
#   On success, a flat key/value list meant to be assigned to a hash.
#   Keys: remote_ip, remote_username, username, date, time, gmt_offset,
#   the_request, status, bytes_out, the_rest, request_method,
#   request_uri, protocol, script_name, query_string, time_mday,
#   time_mon (zero-based), time_year, time_hour, time_min, time_sec,
#   time_epoch and invalid (set to "1" when the quoted request could not
#   be split into method/uri/protocol). The keys referer and user_agent
#   are present only when $logline_regex captures them (combined format).
#
#   On failure (the line does not match $logline_regex, or its timestamp
#   cannot be converted) the two-element list (0, 0) is returned; assigned
#   to a hash this gives a defined entry under key "0", which is what the
#   caller tests for.
#
# Reads the file-level $logline_regex, $therequest_regex and %months.
sub parse_logline {
    # Work on a lexical copy; the global $_ is left untouched.
    my ($logline) = @_;

    my @parsed_logline = ($logline =~ /$logline_regex/x);
    return (0, 0) if @parsed_logline == 0;

    # Name the captures according to how many the regex produced: the
    # common format gives 10, the combined format adds referer and
    # user_agent before the trailing "the rest" capture.
    my @field_names = qw(
        remote_ip remote_username username
        date time gmt_offset
        the_request status bytes_out
    );
    push @field_names, qw(referer user_agent) if @parsed_logline >= 12;
    push @field_names, "the_rest";

    my %request = (invalid => 0);
    @request{@field_names} = @parsed_logline;

    # parse the request line
    my @parsed_therequest = ($request{"the_request"} =~ /$therequest_regex/x);
    if (@parsed_therequest == 0) {
        $request{"invalid"} = "1";
        @request{qw(request_method request_uri protocol)} = ("", "", "");
    } else {
        @request{qw(request_method request_uri protocol)} = @parsed_therequest;
    }

    # Apache logs "-" when no body bytes were sent.
    if ($request{"bytes_out"} eq "-") {
        $request{"bytes_out"} = 0;
    }

    # Break the timestamp into its components.
    @request{qw(time_mday time_mon time_year)}
        = ($request{"date"} =~ m/^(\d+)\/(\S+)\/(\d+)/x);
    @request{qw(time_hour time_min time_sec)}
        = ($request{"time"} =~ m/(\d+):(\d+):(\d+)/x);

    # Month name -> zero-based month number (undef for an unknown name).
    my $month_name = $request{"time_mon"};
    $request{"time_mon"} = defined($month_name) ? $months{$month_name} : undef;

    # A timestamp we cannot fully decode is reported as a parse failure
    # instead of letting Time::Local abort the whole scan.
    my @time_parts = @request{qw(
        time_sec time_min time_hour time_mday time_mon time_year
    )};
    return (0, 0) if grep { !defined($_) } @time_parts;

    my $epoch;
    {
        local $@;
        $epoch = eval { timegm(@time_parts) };
    }
    return (0, 0) if !defined($epoch);

    # The logged wall-clock time is UTC plus gmt_offset, so subtract the
    # offset to get the real epoch regardless of the local timezone. An
    # offset that is not of the form +hhmm/-hhmm is treated as +0000.
    if ($request{"gmt_offset"} =~ m/^([+-])(\d\d)(\d\d)$/) {
        my $offset_seconds = ($2 * 60 + $3) * 60;
        $epoch += ($1 eq "-") ? $offset_seconds : -$offset_seconds;
    }
    $request{"time_epoch"} = $epoch;

    # Split on the first "?" before decoding, so that an encoded "%3F"
    # is never mistaken for the query-string separator.
    my $offset = index($request{"request_uri"}, "?");
    if ($offset != -1) {
        $request{"script_name"} = substr($request{"request_uri"}, 0, $offset);
        $request{"query_string"} = substr($request{"request_uri"}, $offset + 1);
    } else {
        $request{"script_name"} = $request{"request_uri"};
        $request{"query_string"} = "";
    }

    # Decode %XX sequences in the two normalized fields.
    for my $field (qw(request_uri query_string)) {
        $request{$field} =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg;
    }

    return %request;
}

# -- main ---------------------------------------
#
# Usage: logscan <logfile> <fieldname> <regex>
#
# Prints every line of <logfile> whose <fieldname> value, as produced by
# parse_logline, matches <regex> (case-insensitively). Lines that cannot
# be parsed are reported on STDERR and skipped. Usage errors, an invalid
# regular expression and an unknown field name are reported on STDERR
# with exit status 1, so they never mix with the matched lines on STDOUT.

my $USAGE = "Usage: logscan <logfile> <fieldname> <regex>\n";

if (@ARGV != 3) {
    print STDERR $USAGE;
    exit 1;
}

my ($filename, $field_name, $regex) = @ARGV;

# Compile the pattern once, up front, so a malformed expression is
# reported before any input is read.
my $matcher = eval { qr/$regex/i };
if (!defined($matcher)) {
    print STDERR "Invalid regular expression: $@";
    exit 1;
}

# Three-argument open: the file name is never interpreted as a mode or
# a pipe, whatever characters it contains.
open(my $log_fh, "<", $filename) or die "Failed to open $filename: $!\n";

while (my $line = <$log_fh>) {
    # ignore empty lines
    next if ($line =~ m/^$/);

    my %request = parse_logline($line);

    # parse_logline signals failure with a defined entry under key "0".
    if (defined($request{0})) {
        print STDERR "Failed to parse line: " . $line;
        next;
    }

    if (!defined($request{$field_name})) {
        print STDERR "Unknown field name: $field_name\n";
        exit 1;
    }

    print $line if ($request{$field_name} =~ $matcher);
}

close($log_fh);
