#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
# $Id: tv_grab_fr,v 1.0 2003/01/31 23:04:20 epaepa Exp $

=head1 NAME

tv_grab_fr - Grab TV listings for France.

=head1 SYNOPSIS
To configure: tv_grab_fr --configure [--config-file FILE]
To grab listing : tv_grab_fr [--help] [--output FILE] [--days N] [--offset N] [--quiet] [--slow]

=head1 DESCRIPTION

Output TV listings for several channels available in France (Hertzian, Cable/satellite, Canal+ Sat, TPS).  The data
comes from tf1.guidetele.com.  The default is to grab as many days as
possible from the current day onwards. By default the program description are not
downloaded, so if you want description and ratings, you should active the --longlisting option.

B<--configure> Grab channels informations from the website and ask for channel type and names.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days starting from today, rather than as many as
possible. Due to the website organization, the speed is exactly the same, whatever the number of days is until you activate the --slow option.
For example, --days 1 will take the same amount of time than --days 7. So by default, the --days option is set to 7.

B<--offset N> start grabbing N days from today, rather than starting
today.  N may be negative. Due to the website organization, N cannot be inferior to -1.

B<--slow> Gets additionnal information from the website, like program description, reviews and credits

B<--quiet> suppress the progress messages normally written to standard
error.

=head1 SEE ALSO

L<xmltv(5)>

=head1 AUTHOR

Sylvain Fabre, centraladmin@lahiette.com

=cut

# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get France television listings in XMLTV format
To configure: tv_grab_fr --configure [--config-file FILE]
To grab listing : tv_grab_fr [--help] [--output FILE] [--days N] [--offset N] [--quiet] [--slow]
END
  ;

# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

use strict;
use XMLTV::Version '$Id$';
use Getopt::Long;
use LWP::Simple ();
use LWP::UserAgent;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;
use URI;
use Date::Manip;
use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::Mode;
use XMLTV::Config_file;
use XMLTV::Europe_TZ;


#***************************************************************************
# Main declarations
#***************************************************************************
my $GRID_BASE_URL = 'http://telepoche.guidetele.com/index.html?b=';
my $GRID_BY_CHANNEL = 'http://telepoche.guidetele.com/index_chaine.html?c=';
my $SHEET_URL = "http://tf1.guidetele.com/fiche.html?id=";
my $LANG = "fr";
my $MAX_STARS = 3;

my %GridType = ( "HERTZIENNE"     => "15305",
                 "CABLE/SAT"      => "15306",
                 "TPS"            => "15307",
                 "CANAL SAT"      => "15308",
                 "ETRANGERES"     => "15309" );

my %SummerWinterHours = ( "20040328020000" => "+0100",
                          "20041031030000" => "+0200",
                          "20050327020000" => "+0100",
                          "20051030030000" => "+0200",
                          "20060326020000" => "+0100",
                          "20061029030000" => "+0200" );

my @offsets = (24, 0, 4, 8, 12, 16, 20);

#***************************************************************************
# Options processing and global variables allocation
#***************************************************************************
XMLTV::Memoize::check_argv('get_page');
my ($opt_days,  $opt_help,  $opt_output,  $opt_offset,  $opt_quiet,  $opt_list_channels, $opt_config_file, $opt_configure, $opt_slow);
$opt_offset = 0;
$opt_quiet = 0;
$opt_days = 7;
$opt_output = '-'; # standard output
GetOptions('days=i'    => \$opt_days,
     'help'      => \$opt_help,
     'output=s'  => \$opt_output,
     'offset=i'  => \$opt_offset,
     'quiet'     => \$opt_quiet,
     'configure' => \$opt_configure,
     'list-channels' => \$opt_list_channels,
     'slow' => \$opt_slow
    )
  or usage(0);
die 'Number of days must not be negative'  if (defined $opt_days && $opt_days < 0);
die 'Cannot get more than one day before current day' if (defined $opt_offset && $opt_offset < -1);
usage(1) if $opt_help;

my %results;
my $lastdaysoffset = $opt_offset + $opt_days;

# Set the user agent for page download
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)'); # Identify as MSIE to avoid grabber detection at original website ...
$ua->timeout(600);

# Now detects if we are in configure mode
my $mode = XMLTV::Mode::mode('grab', # default
                        $opt_configure => 'configure',
                        $opt_list_channels => 'list-channels');

# File that stores which channels to download.
my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_fr', $opt_quiet);


#***************************************************************************
# Sub sections
#***************************************************************************
sub get_channels( $ );
sub process_channel_grid_page( $$$$ );

sub GetTimeOffset {
    my ($swdate, $mydate) = $_[0];
    foreach $swdate (sort keys %SummerWinterHours) {
        if ( (&Date_Cmp($mydate, $swdate )) < 0 ) {
            return $SummerWinterHours{$swdate};
        }
    }
}

# Get a page using this agent.
sub get_page( $ ) {
    my $url = shift;
    my $response = $ua->get($url);
    die $response->status_line if not $response->is_success;
    return $response->content;
}

sub xmlencoding {
    # encode for xml
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/&/\%26/g;
    return $_[0];
}

sub tidy {
    # clean bad characters from HTML
    for (my $s = shift) {
        tr/\205//d;
        tr/\222/''/;
        s/\234/oe/g;
        return $_;
    }
}

#***************************************************************************
# Configure mode
#***************************************************************************
if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);
    open(CONF, ">$config_file") or die "Cannot write to $config_file: $!";

    # Get a list of available channels, according to the grid type
    my @gts = sort keys %GridType;
    my @gtnames = map { $GridType{$_} } @gts;
    my @gtqs = map { "Get channels type : $_?" } @gts;
    my @gtwant = askManyBooleanQuestions(1, @gtqs);

    foreach (@gts) {
        my $gtw = shift @gtwant;
        my $gtname = shift @gtnames;
        if ($gtw) {
            say  "Now getting channel list for : $_ \n";
            my %channels = get_channels( $gtname );
            die 'No channels could be found' if (scalar(keys(%channels)) == 0);
            # update $bar if Have_bar && not $opt_quiet;
            # Ask about each channel.
            my @chs = sort keys %channels;
            my @names = map { $channels{$_} } @chs;
            my @qs = map { "Add channel $_?" } @names;
            my @want = askManyBooleanQuestions(1, @qs);
            foreach (@chs) {
                my $w = shift @want;
                warn("Cannot read input, stopping channel questions"), last if not defined $w;
                # Print a config line, but comment it out if channel not wanted.
                print CONF '#' if not $w;
                my $name = shift @names;
                print CONF "channel $_ $name\n";
            }
        }
    }
    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");
    exit();
}

#***************************************************************************
# Check mode checking and get configuration file
#***************************************************************************
die if $mode ne 'grab' and $mode ne 'list-channels';

my @config_lines;
if ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}

#***************************************************************************
# Prepare the XMLTV writer object
#***************************************************************************
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}

$w_args{encoding} = 'ISO-8859-1';
my $writer = new XMLTV::Writer(%w_args);
$writer->start
  ({ 'source-info-url'     => 'http://telepoche.guidetele.com/',
     'source-data-url'     => 'http://telepoche.guidetele.com/',
     'generator-info-name' => 'XMLTV',
     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
   });

#***************************************************************************
# List channels only case
#***************************************************************************
if ($opt_list_channels) {
    # Get a list of available channels, according to the grid type
    my @gts = sort keys %GridType;
    my @gtnames = map { $GridType{$_} } @gts;
    my @gtqs = map { "List channels for grid : $_?" } @gts;
    my @gtwant = askManyBooleanQuestions(1, @gtqs);

    foreach (@gts) {
        my $gtw = shift @gtwant;
        my $gtname = shift @gtnames;
        if ($gtw) {
            say  "Now getting grid : $_ \n";
            my %channels = get_channels( $gtname );
            die 'no channels could be found' if (scalar(keys(%channels)) == 0);
            foreach my $ch_did (sort(keys %channels)) {
                my $ch_name = $channels{$ch_did};
                my $ch_xid = "$ch_did";
                $writer->write_channel({ id => $ch_xid, 'display-name' => [ [ $ch_name ] ] });
            }
       }
     }
     $writer->end();
     exit();
}

#***************************************************************************
# Now the real grabbing work
#***************************************************************************
die if $mode ne 'grab';

#***************************************************************************
# Build the working list of channel name/channel id
#***************************************************************************
my (%channels, @channels, $chid, $chname);
my $line_num = 1;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;

    # Here we store the Channel name with the ID in the config file, as the XMLTV id = Website ID
    if (/^channel:?\s+(\S+)\s+([^\#]+)/) {
        $chid = $1;
        $chname = $2;
        $chname =~ s/\s*$//;
        push @channels, $chid;
        $channels{$chid} = $chname;
    }
}

#***************************************************************************
# Now process the days by getting the main grids.
#***************************************************************************
my @to_get;

# The website stores channel information by hour area for a whole week !
foreach $chid (sort @channels) {
    $chname = $channels{$chid};
    $writer->write_channel({ id => "C".$chid.".telepoche.com", 'display-name' => [ [ $chname ] ] });
    foreach (@offsets) {
        my $url = $GRID_BY_CHANNEL . "$chid&h=$_";
        push @to_get, [ $url, $chid, $_ ];
    }
}

my $bar = new Term::ProgressBar('Getting grid page', scalar @to_get)  if Have_bar && not $opt_quiet;
foreach (@to_get) {
    my ($url, $chid, $slot ) = @$_;
    process_channel_grid_page($writer, $chid, $url, $slot);
    update $bar if Have_bar && not $opt_quiet;
}
$writer->end();

#***************************************************************************
# Specific functions for grabbing information
#***************************************************************************
sub get_channels( $ ) {
    my $gridid = shift;
    my %channels;
    my $url = $GRID_BASE_URL.$gridid;

    my $t = HTML::TreeBuilder->new;
    $t->parse(tidy(get_page($url)));
    $t->eof;
    foreach my $cellTree ( $t->look_down( "_tag", "td", "width", "50", "height", "62" ) ) {
        my $chid = $cellTree->look_down( "_tag", "a" )->attr('href');
        $chid =~ /index_chaine.html\?c=(.*)\&b=/;
        $chid = $1;
        my $chname = $cellTree->look_down( "_tag", "img" )->attr('alt');
        $channels{$chid} = $chname;
    }
    $t->delete(); undef $t;
    return %channels;
}

sub process_channel_grid_page( $$$$ ) {
    my ($writer, $chid, $url, $slot) = @_;
    my ($genre, $showview, $hours, $starthour, $endhour, $date, $dateindex) = 0;
    my ($name, $subgenre, $footext, $star_rating, $datecreate) = 0;

    my $t = HTML::TreeBuilder->new;
    $t->parse(tidy(get_page($url)));
    $t->eof;
    # Each day is encapsulated in a table with the following parameters :
    foreach my $tableTree ($t->look_down('_tag', 'table', 'width', '532', 'name', "chn_$chid") ) {
        # Then we have the current date
        if ( my $cellTree = $tableTree->look_down('_tag', 'span', 'class' => 'sTxt') ) {
            if (my $dateTree = $cellTree->look_down('_tag', 'b') ) {
                my ($day, $month) = split (/\//,$dateTree->as_text);
                $date = &ParseDate("$month/$day/".&UnixDate("today","%Y"));
                $dateindex = &UnixDate($date, "%Y%m%d");
                next if (Date_Cmp($dateindex,&DateCalc("today","+$opt_offset days"))<0 || Date_Cmp($dateindex,&DateCalc("today","+$lastdaysoffset days"))>0 );
            } else {
                die "Malformated content on URL : $url \n";
            }
        }
        # Then the program information
        foreach my $progTree ($tableTree->look_down('_tag', 'a', 'onmouseout', 'nd()') ) {
            $_ = $progTree->attr('onmouseover');
            $_ = (!m/drc\(([^"]+)\)/);  # " To avoid dummy syntax color in some editors
            $1 =~ m/\'(.*)\',\'(.*)\'/;
            $name = $2;
            next if ( $2 eq 'Fin des programmes');
            ($hours, $genre, $showview) = split (/<br>/, $1);
            next if ( !$hours );
            # Process the title, sometimes a showview field is shown
            $name =~ s/^\d{7} //;
            $name =~ s/\\//g;
            if ($name =~ s/\s+([*]+)\s*$//) {
                my $n = length $1;
                if (0 < $n and $n <= $MAX_STARS) {
                    $star_rating = $n;
                } elsif ($MAX_STARS < $n) {
                    warn "too many stars ($n), expected at most $MAX_STARS\n";
                } else { die }
            }
            warn "Too many star-ratings for a programme : $name \n" if $name =~ /[*]$/;
            my @title_data = ([ xmlencoding($name), $LANG]);
            # Process hours, there are like HHhMM
            ($starthour, $endhour)  = split("-", $hours);
            $starthour =~ s/h//g;
            $endhour   =~ s/h//g;
            # Build the program table
            my $timeoffset = GetTimeOffset($dateindex.${starthour}."00");
            my $start = $dateindex.$starthour." ".$timeoffset;
            my $stop  = $dateindex.$endhour." ".$timeoffset;
            # Dummy site : the slot 0-4 of day n is in fact the slot 0-4 for day n+1
            if ( $slot == 24 ) {
                my $myslot = substr($starthour, 0, 2);
                $start = &UnixDate(&DateCalc($start, "+1 day"), "%Y%m%d%H%M %z") if ($myslot >= 0 && $myslot < 4);
                $stop  = &UnixDate(&DateCalc($stop, "+1 day"), "%Y%m%d%H%M %z");
            }
            # Last check to see if start > stop
            if ( Date_Cmp($start, $stop) > 0 ) {
                $stop = &UnixDate(&DateCalc($stop, "+1 day"), "%Y%m%d%H%M %z");
            }
            my %prog = (channel  => "C".$chid.".telepoche.com",
                        title    => \@title_data,
                        start    => $start,
                        stop     => $stop
                        );
            $prog{'star-rating'} = [$star_rating] if defined $star_rating;
            # Sometimes the genre is not set, so replace it by the showview field
            if (defined $genre and $genre =~ m/Showview : /) {
                $showview = $genre;
                undef $genre;
            }
            # Process the genre, subgenre and date if defined
            if  (defined $genre ) {
                ($genre, $datecreate) = split("-", $genre);
                ($genre, $subgenre)   = split(",", $genre);
                $genre = substr($genre, 1); # Dont know why, but the first car is ASCII(160) !!
                if (defined $subgenre) {
                    $subgenre =~ s/^\s+//;
                    $prog{category} = [ [ xmlencoding($genre), $LANG ], [ xmlencoding($subgenre), $LANG ] ];
                } else {
                    $prog{category} = [ [ xmlencoding($genre), $LANG ] ];
                }
                if (defined $datecreate) {
                    $datecreate = substr($datecreate, 1); # Dont know why, but the first car is ASCII(160) !!
                    $prog{date} = $datecreate ;
                }
             }
            # Process the showview field
            if ( defined $showview ) {
                $showview =~ s/Showview : //;
                $showview = substr($showview, 1); # Dont know why, but the first car is ASCII(160) !!
                $prog{showview} = $showview;
            }
            # Now get program description if the longlisting option is set
            if ( $opt_slow && $progTree->attr('class') eq 'fic' ) {
                my $id = $progTree->attr('href');
                my @desc;
                $id =~ m/javascript\:of\((\d+)\)/;
                $id = $1;
                my $tfic = HTML::TreeBuilder->new;
                $tfic->parse(tidy(get_page($SHEET_URL . $id)));
                $tfic->eof;
                if ( my $tdesc = $tfic->look_down('_tag', 'td', 'width', '396', 'class', 'txt') ) {
                    foreach my $cdesc ($tdesc->look_down('_tag', 'p', 'class', 'txt') ) {
                        $cdesc->delete_ignorable_whitespace();
                        my $desc = $cdesc->as_text;
                        chop($desc);
                        $desc =~ s/RESUME/RESUME : /g;
                        $desc =~ s/AVIS/AVIS : /g;
                        $desc =~ s/HISTOIRE/HISTOIRE : /g;
                        push @desc, $desc if (length($desc) > 10);
                    }
                }
                $prog{desc} = [ [ join('\n', @desc), $LANG ] ] if @desc;
            }
            if ( !$results{$prog{start}.$chid} ) {
                $results{$prog{start}.$chid} = "1";
                $writer->write_programme(\%prog);
            }
        }
    }
    $t->delete(); undef $t;
}
