#!/usr/bin/perl -w
# by Chloe Lewis
# Usage : ./2utf <moniwiki datadir> <dest datadir>

# from piconv
use strict;
use Encode;
use Encode::Alias;
use POSIX qw(strftime);

# Operation codes handed to flock(): shared lock, exclusive lock, unlock.
my ($LOCK_SH, $LOCK_EX, $LOCK_UN) = (1, 2, 8);

# When true, progress messages are printed while converting.
my $_VERBOSE = 1;

# Both the source directory and the destination directory are required.
unless (@ARGV >= 2)
{
  print "Usage : $0 <text directory> <new dir>\n";
  exit 1;
}
my ($rootdir, $destdir) = @ARGV;

# Convert the whole tree, then report success to the shell.
_utfme($rootdir, $destdir);
exit 0;

# Recursively mirror $dir into $destdir, converting as it goes.
#
# Arguments:
#   $dir     - source directory to read
#   $destdir - destination directory (created when it does not exist)
#
# For every entry of $dir whose name does not start with a dot:
#   * a sub-directory is processed recursively under the same name;
#   * a file has its escaped name decoded (_get_fn_euc), converted from
#     EUC-KR to UTF-8 and re-escaped (_get_fn_utf); its contents are
#     converted line by line from EUC-KR to UTF-8; and the access and
#     modification times of the new file are set to the source's mtime.
#
# Dies when a directory or file cannot be opened or created, or when the
# converted file cannot be completely written.
sub _utfme
{
  my $dir = shift;
  my $destdir = shift;

  unless (-e $destdir)
  {
    mkdir $destdir
      or die "cannot create `$destdir': $!";
  }

  opendir my $dh, $dir
    or die "cannot open `$dir': $!";
  my @list = grep { /^[^.]/ } readdir $dh;
  closedir $dh;

  # return if no files found
  unless (@list) { print "no files in $dir\n" if $_VERBOSE; return; }

  foreach my $fn_orig (@list)
  {
    my $src = $dir."/".$fn_orig;

    # recurse if directory (directory names are kept as they are)
    if ( -d $src ) { _utfme($src, $destdir.'/'.$fn_orig); next; }

    # Hand the name over both as an argument and in $_, so the helper sees
    # it whichever of the two it reads.
    my $fn_euc = do { local $_ = $fn_orig; _get_fn_euc($fn_orig) };
    my $fn_utf = $fn_euc;
    Encode::from_to($fn_utf, "euc-kr", "utf8");
    $fn_utf = _get_fn_utf($fn_utf);
    my $dst = $destdir.'/'.$fn_utf;

    print STDERR 'convert '.$fn_orig.' to '.$fn_utf."\n";

    # convert contents
    print "utf8ize $fn_euc..." if $_VERBOSE;

    # Three-argument open: the path is never parsed for a mode or for
    # leading/trailing whitespace, whatever characters the name contains.
    open my $in, '<', $src
      or die "cannot open `$fn_orig' for reading: $!";
    flock $in, $LOCK_SH;
    open my $out, '>', $dst
      or die "cannot open `$fn_utf' for writing: $!";
    flock $out, $LOCK_EX;
    while (my $line = <$in>)
    {
      Encode::from_to($line, "euc-kr", "utf8");
      print {$out} $line;
    }
    # Closing a handle also drops its lock.  Buffered write errors only
    # show up at close time, so the result is checked.
    close $out
      or die "cannot finish writing `$fn_utf': $!";
    close $in;

    print STDERR 'convert '.$src." ".$dst."\n";

    # Copy the timestamp with utime() instead of running touch(1) through
    # the shell: decoded names may contain spaces or shell metacharacters,
    # and the time stays exact and independent of the local timezone.
    my $time = (stat($src))[9];
    if (defined $time)
    {
      utime $time, $time, $dst
        or warn "cannot set times on `$dst': $!\n";
    }
    print STDERR "done\n" if $_VERBOSE;
  }
  return;
}

# Undo the "_xx" escaping of a stored page file name.
#
# Argument: the escaped file name.  When called without any argument the
#           current value of $_ is used instead.
# Returns:  the name with every "_" followed by two lowercase hex digits
#           replaced by the byte with that value.  No check is made that
#           the resulting bytes form valid EUC-KR (broken input is allowed).
sub _get_fn_euc
{
  # Read the caller's argument; relying on $_ alone only works while the
  # caller happens to loop with $_ as its loop variable.
  my $pagename = @_ ? shift(@_) : $_;
  $pagename =~ s/_([a-f0-9]{2})/chr(hex($1))/eg;

  # Earlier implementation, kept for reference:
#  my ($f, $prev) = (undef, undef);
#
#  foreach (split /_/, shift)
#  {
#    # is a possible piece of euc-kr ?
#    if ( m/^([0-9a-f]{2})(.*)/i && hex($1)>127 )
#    {
#      # keep if no prev piece
#      unless ($prev||$2) { $prev = $_; next; }
#      # construct a korean char :)
#      $f .= chr(hex($prev)).chr(hex($1)).$2;
#      $prev = undef;
#    }
#    else
#    {
#      $f .= "_".$prev if $prev;
#      $f .= "_".$_;
#    }
#  }
#  # concat if the last piece left
#  $f .= "_".$prev if ($prev);
#  $f =~ s/^_+//;
#  return $f;
  return $pagename;
}

# Escape a UTF-8 byte string so it can be used as a file name.
#
# Argument: the name as a string of bytes.
# Returns:  the escaped name:
#   * a lead byte (both top bits set) and the continuation bytes announced
#     by it are each written as "_" plus two lowercase hex digits;
#   * ASCII bytes and stray continuation bytes (10xxxxxx outside a
#     sequence) are copied unchanged;
#   * finally every "/" is replaced by "_2f".
# The bytes following a lead byte are not validated; they are escaped
# whatever their value.
sub _get_fn_utf
{
  my $raw = shift;
  my $escaped;
  my $pending = 0;   # bytes still owed to the sequence being escaped

  for my $byte (map { ord } split //, $raw)
  {
    # inside a multi-byte sequence: escape unconditionally
    if ($pending)
    {
      $escaped .= "_" . unpack("H2", chr $byte);
      $pending--;
      next;
    }

    # 0xxxxxxx (ascii) or 10xxxxxx (not a starting byte): copy as is
    if (!($byte & 0x80) || !($byte & 0x40))
    {
      $escaped .= chr $byte;
      next;
    }

    # 11xxxxxx: a sequence starts here; the position of the first zero
    # bit below the top two tells how many bytes follow
    $escaped .= "_" . unpack("H2", chr $byte);
    $pending = !($byte & 0x20) ? 1
             : !($byte & 0x10) ? 2
             : !($byte & 0x08) ? 3
             : !($byte & 0x04) ? 4
             : !($byte & 0x02) ? 5
             :                   0;
  }

  $escaped =~ s{/}{_2f}g;
  return $escaped;
}

# Create the directories leading to a path, in the manner of `mkdir -p`.
#
# Argument: a path.
#   * Without a trailing "/", the last component is taken to be a file
#     name: it is dropped and the remaining directory is created together
#     with its missing ancestors.  A bare name with no "/" at all is
#     created as a directory itself.
#   * With a trailing "/", that directory is created (its parent is
#     expected to exist).
# Returns:  the directory that was ensured to exist.
# Dies when a directory can neither be created nor found afterwards.
sub _mkdir_p
{
  my $f = shift;

  # suppose no unusual chars are contained in the fullpath
  unless ($f =~ m,/$,)
  {
    if ($f =~ s,/[^/]+$,,)
    {
      # Stripping the only component of an absolute path leaves an empty
      # string; that parent is the root directory.
      $f = '/' if $f eq '';
      _mkdir_p($f);
    }
  }

  return $f if -d $f;

  unless (mkdir $f)
  {
    # Trying again can never succeed, so report the failure instead of
    # recursing.  A directory that appeared in the meantime is fine.
    my $err = $!;
    die "cannot create directory `$f': $err" unless -d $f;
  }
  return $f;
}

