#!/usr/bin/perl -w
#
# 2006 (c) Novell inc. 
# jw@suse.de, distribute under GPLv2
#
# tess2col.pl -- a multicol converter for tesseract-ocr output
#
# Algorithm to find the exact position where to split columns:
# - ignore all whitespace seperators of length 1.
# - weight all other whitespace seperators by their length.
# - reduce weight for $o distance to the nearest $colwidth column:
#   by a factor of ($colwidth-$o)/$colwidth, (which has a max of 2).
# - within each [0.5..1.5]*$colwidth chop line at space string with highest
#   weight.
# - Lines shorter than $colwidth are repeated into every column.
#   Only one column has text in this line, but teseract does not 
#   tell us which column it was. There is no space padding in this case.
#
# enhancement: should do a dry run and find out number of columns.
#  so that short lines (covering 2 columns of a 3 column layout) can be found.
#  currently this only works for a 2 column layout.
#
# enhancement: should do a dry run and autodetect the column width.

use Data::Dumper;	# for debugging only.

my $version = '1.0';
my $colwidth_def = 66;
my $colwidth = $ARGV[0] || $colwidth_def;
my $firstline = 0;		# split processing starts here
my $lastline = 999;		# splint processing ends here
my $pagebreak = '[%-- next_column --%]';	# how we seperate our output

my $lineno = 0;
my $morepages = 0;
my @l;

unless ($colwidth =~ m{^\d+$})
  {
    print qq{tess2col V$version
    
Usage: 
    $0 [chars_per_col] < tesseract-ocr-output.txt > single_columns.txt

Future options:

-w chars		Number of characters per column. Default: $colwidth_def.
-f firstline		First line to process. Default: $firstline.
-l lastline		Last line to process. Default: $lastline.
-s col_sep_string	Column separator. Default: '$pagebreak'

};
    exit 0;
  }

while (defined (my $line = <STDIN>))
  {
    chomp $line;
    $lineno++;
    my @c = split_cols($line);
    print "$c[0]\n";
    shift @c;
    push @l, [@c];
    $morepages++ if scalar @c;
  }

while ($morepages)
  {
    print "\n$pagebreak\n";
    $morepages = 0;
    for my $l (@l)
      {
        my $a = shift @{$l} || '';
        print "$a\n";
	$morepages++ if scalar @{$l};
      }
  }

exit 0;
##################################################
sub sep_weight
{
  my ($center, $size) = @_;
  my $w = $size;
  my $o = $center % $colwidth;
  my $delta = $colwidth-$o;
  $delta = $o if $center >= $colwidth/2 and $delta > $colwidth/2;
  # delta is now the distance to the nearest $colwidth multiple > 0.

  my $f = ($colwidth-$delta)/$colwidth;

  ##
  ## need a steep fall-off function.
  ## quadratic is far too weak for
  ## "Other than the then current. unaltered version oflhe applicable      immediate termination of this Agreement.  Distributor shall"
  ##
  return $size * $f * $f * $f * $f;	
}

sub split_cols
{
  my ($line) = @_;

  return ("[%?%]$line", "[%?%]$line") if length($line) < $colwidth + 3;	# wild guess: 5 spaces centered on colwidth delimit columns.
  $line .= '  ';		# a dummy padding to prevent break before the last word.
  my $linelen = length $line;
  my @s;
  while ($line =~ m{(\s\s+)}g)
    {
      my $n = length $1;
      push @s, [pos($line)-$n, $n, sep_weight(pos($line)-$n/2, $n)];
    }

# die Dumper \@s;
  my @line;
  my $startpos = 0;

  my $col = 0;
  while ($col < $linelen)
    {
      $col += $colwidth;

      # hit the end exactly, so that the weigth of the trailing space is high.
      $col = $linelen+1 if $col > $linelen;	

      my $best_idx = -1;
      my $max_weight = -1;
      for my $i (0..$#s)
        {
	  next if $s[$i][0]+$s[$i][1]/2 <= $col - $colwidth/2 or 
	          $s[$i][0]+$s[$i][1]/2 >  $col + $colwidth/2;

# print "testing $s[$i][0] with $col\n";
	  if ($s[$i][2] > $max_weight)
	    {
	      $max_weight = $s[$i][2];
	      $best_idx = $i;
# print "new best: $max_weight\n";
	    }
        }
      push @line, substr $line, $startpos, $s[$best_idx][0]-$startpos;
      $startpos = $s[$best_idx][0]+$s[$best_idx][1];
    }

  push @line, substr $line, $startpos, $linelen-$startpos-1 if $startpos and $startpos < $linelen;
  pop @line unless length $line[-1];

#  die Dumper \@line;
  return @line;
}
