#!/usr/bin/perl -w
#
# dislines: distribute in several files the lines of a tagged text.
# Version 1.1  (c) 12-7 to 26-7-2005  Daniel Clemente Laboreo
# http://www.danielclemente.com/dislines/
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
#
#
# Super-quick syntax guide:
# (full syntax at): http://www.danielclemente.com/dislines/syntax.en.html
# 
# Simple: @tag
# Multiple: @tag1,tag2,tag3
# Blocks: open with @{tag or @tag{  and close them with @}tag @tag} or @}
# Comments: @-----    or in a block. Any number of -
# Repeat last tag used: @"""""""     Any number of "
# You can use a lot of @ if you want: @@@@@@@@tag
# Common lines can also have it: @@ This is common
#
#
#
#
# The idea for the program is: 
#
# 1. Create a temporary file for the common lines
# 2. For every line of the file:
# 3. | If it's common (no tag), add it to the global file created in (1)
# 4. | If it has a tag or more attached, do for each tag:
# 5. | | If it's the first time we see this tag:
# 6. | | | Duplicate the global file to make a temporary file for this tag
# 7. | | Add the line to the temporary file of the tag
# 8. | Also add the line to the global file
# 9. Now turn the temporary files into the final ones and delete the global
#
#

# use v5.0; # or later

use strict;
# use warnings; # Not on <5.6.0.  I'm using -w

use Getopt::Long;
use File::Copy;
use File::Basename;


# 
# Customizable settings:
# 

# Change to 1 to better understand the code:
my $DEBUG=0;

# How to separate the "TAG" from the "file" in "file.TAG.ext"
my $SEP_CHAR='.'; # file.TAG.ext
# my $sep_char='-'; # file-TAG.ext   (eg. index-en.html)

# If you don't like ats (@), you can change the used symbol here.
# Of course, you must put the one you use in your files.
my $AT='@';


# Options from the command line
my (
	$just_list,
	$only_tags,
	$out_base,
	$just_stats,
	$show_help,
	$show_version,
	$quiet,
);

GetOptions (
	"list"    => \$just_list,
	"tags=s"  => \$only_tags,
	"out=s"   => \$out_base,
	"stats"   => \$just_stats,
	"quiet"   => \$quiet,
	"help"    => \$show_help,
	"version" => \$show_version,
)
	or die "Strange parameters passed. Stopping";


&show_help() if $show_help;
&show_version() if $show_version;


my $input_file;

if ($#ARGV ne 0) {
	print STDERR "I need the name of the file to process. See help (-h)\n";
	exit -1;
}

$input_file = pop @ARGV;



open (INPUT, $input_file) or &system_error("Can't open $input_file");

warn "Be careful: $input_file doesn't seem a text file. Warning"
	if not $quiet
	and not -T $input_file
	and not $input_file eq "-";


# Once opened, we just want its name, to create temporary files
$^O =~ s/^dos$/msdos/i;     # dosemu said just "dos"
fileparse_set_fstype($^O);  # and this requires "msdos"
$input_file = basename ($input_file);

# If not specified, use the same name as the base for the output files
$out_base ||= $input_file;

# Don't print messages if we're using STDOUT
$quiet=1 if $out_base eq "-";


my %file=();   # For each tag, it has its associated file handler.
               # tag "" means common text

my %lines=();  # For stats: How many lines has each file


&do_list() if $just_list;


print "Processing $input_file\n" if not $quiet;

my $last_tag="";  # Last tag seen ( for @"" )
my $in_block="";  # Are we inside a block? If so, which tags is it for?
                  # Both variables contain a list of tags, like "en,eo,it"


# Open buffer for common lines

# 'autovivify' on open() is just for >=5.6.0
# So is the 3 parameters version...
open ( COMMON, "+>". &temp_file_name("") )
	or &system_error("Can't create temporary file");
$file{""}=*COMMON;


while ( <INPUT> ) {
	
	my ($command, $rest_of_line);
	
	chomp; # No newlines ( \n ) at the end
	if ($DEBUG) {print "$.: $_\n";}

	($command, $rest_of_line)= /^${AT}+(.*?)(?: (.*))?$/o
		or ($command, $rest_of_line)=("",$_); # Common line (no tag)

	# Handle @en\n and others
	$rest_of_line="" if not defined $rest_of_line;


	# Skip comments: @---
	if ($command =~ /^\-+$/) {
		$lines{"-"}++ if $just_stats;
		next;
	}

	
	# Handle @{ and @}
	
	if ($command =~ /^\{(.*)$/ or $command =~ /^(.*)\{$/) {
		# Opening a block
		&bad_syntax("You can't nest blocks") if $in_block;
		&bad_syntax("Block with no tags") if not $1;
		$in_block=$1;
		if ($DEBUG) {print "Entering block $in_block\n";}
		next;
		
	} elsif ($command =~ /^\}(.*)$/ or $command =~ /^(.*)\}$/) {
		# Closing a block
		&bad_syntax("You closed block $1, but no block was opened")
			if (not $in_block);
		&bad_syntax("You closed block $1, but I was expecting $in_block")
			if ($1 and $1 ne $in_block);
		if ($DEBUG) {print "Going out of block $in_block\n";}
		$in_block="";
		next;

	}




	# Handle blocks, and @""

	if ($in_block) {
		&bad_syntax("Use of tag $command not allowed inside a block")
			if $command ne "";

		# Translate the block @{tag to a simple @tag for each line
		$command=$in_block;
		
	} else { # Things to do when you're not into a block
		$command=$last_tag if $command =~ /"+/;

	}


	if ($command eq "") {
		# Not tagged. It may be a common line, or rest inside a block

		if (not $in_block) { # Common line
			#   Add it to the global buffer ($file{""})
			# and to each known file
			foreach my $tag ( keys %file ) {
				print { $file{$tag} } "$rest_of_line\n";
				$lines{$tag}++ if $just_stats;
			}

			if($DEBUG) {print "Adding common line: $rest_of_line\n";}

			next; # No tags to process
		} else {
			$command=$in_block;
		}
	}
	

	# Handle things like @,,,,
	&bad_syntax("Don't do that with the commas") if $command =~ /^,+$/;

        # Iterate over the t1, t2, t3 of a @t1,t2,t3
	foreach my $tag ( split /,/, $command ) {
		&bad_syntax("Don't include null tag names")
			if $tag eq ""; # as in the case of a @a,,b
		&bad_syntax("Illegal tag name $tag (remove the '$&')")
			if $tag =~ /[\/\\\@\{\}]/;
		
		# now, tag is correct (it doesn't match /[ ,]/ nor /[\@{}]/ )

		if ($tag =~ /^\-*$/) { # it's a comment
			$lines{"-"}++ if $just_stats;
			next;
		}
			

		next if ($only_tags and $only_tags !~ /\b$tag\b/); # Not selected

		if (! $file{$tag} ) { # New tag found

			# Create temporary file for this tag

			local (*HANDLE);
			open ( HANDLE, "+>". &temp_file_name($tag) )
				or &system_error("Can't create temporary file");
			$file{$tag}=*HANDLE;

			# Copy all the lines from the global file to this one
			seek $file{""}, 0, 0
			and copy(  $file{""} , $file{$tag}  )
				or &system_error("Problem duplicating file");
			
			$lines{$tag}=$lines{""} if $just_stats;

			if ($DEBUG) {print "New tag found: $tag\n";}
		}
		
		# Write in each tag's buffer
		print { $file{$tag} } "$rest_of_line\n";
		$lines{$tag}++ if $just_stats;
	}

	# Make @"" work
	$last_tag=$command if not $in_block;


	if ($DEBUG) {print ">>>>> Command: $command\tLine: $rest_of_line\n";}

}

# No more lines to process

&bad_syntax("You forgot to close block $in_block")
	if $in_block;


# Just finished processing; if you wanted just the statistics, here are they
&do_stats() if $just_stats;


# Everything ok, so make the temporary files be the final ones

close INPUT;

close $file{""};
delete $file{""};
unlink &temp_file_name("")
	or &system_error("Error deleting temporary file");

foreach my $tag (keys %file) {
	my $temp=&temp_file_name($tag);
	my $name=&out_file_name($tag);

	if ($out_base ne "-") {
		close $file{$tag};
		move( $temp, $name )
			or &system_error("Error renaming temporary file");
	} else {
		# Write all files -concatenated- to STDOUT
		# Hope that the user had selected just one with -t

		my $handler = $file{$tag};
		seek $handler, 0, 0;
		print while (   <$handler> );
		close $file{$tag};

	}
		
		
	print "Wrote $name\n" if not $quiet;
}

&remove_temp() if ($out_base eq "-");


exit 0;
# And here ended the program



sub bad_syntax ($) {
	my $message = shift;
	print STDERR "Error at line $.: $message\n";
	&remove_temp();
	die "Stopping";
}

sub system_error ($) {
	my $message = shift;
	print STDERR "$message: $!\n";
	&remove_temp();
	die "Stopping";

}

sub remove_temp () {
	foreach my $tag (keys %file) {
		my $name = &temp_file_name($tag);
		if ($DEBUG) {print "Deleting temporary file $name\n";}

		# Don't stop if there are problems; we want most files removed
		unlink $name or warn("Could not delete $name: $!.");
	}
}


sub add_tag ($$) {

	# (file_name, $tag_name)
		
	# From a.html  -> a.TAG.html
	#      a.b.txt -> a.b.TAG.txt
	#      abc     -> abc.TAG
	#

	my $file_name=shift;
	my $tag_name=shift;

	my ($name, $dot_ext) = $file_name =~ /^(.*?)(\.[^\.]*)?$/;
	$dot_ext="" if not $dot_ext;

	#         file      .         TAG       .ext
	return "${name}${SEP_CHAR}${tag_name}${dot_ext}";
}	


sub temp_file_name ($) {
	#   Mmmm... you shouldn't use more than 1 point in your
	# file names if you're using FAT16.
	return "." . &add_tag($input_file, shift);

}

sub out_file_name ($) {
	# $out_base eq $input_file  by default (unless you specify --out)
	return &add_tag($out_base, shift);
}




sub do_list () {

	#   We're not doing a full parse of the file,
	# neither check whether it is valid.
	#   We just search for lines beginning with @ to extract the tags.


	my %seen_tags=();

	print "List of tags used in $input_file:\n" if not $quiet;

	while (<INPUT>) {

		if ($DEBUG) {print "$. $_\n";}
		
		# Take the command from the @command Rest_of_line
		next unless /^${AT}+(?!\@)([^ ]+)[ \n]/o;
	       
		if ($DEBUG) {print "The command seems $1\n";}
		
		foreach my $tag (split /[,\{\}]/, $1) {
			next if $tag =~ /[\\\/\@]/; # skip the erroneous tags
			next if not $tag;
			
			# skip @"" and @--
			next if $tag =~ /^\"+$/ or $tag =~ /^\-+$/;
			
			if ($DEBUG) {print "$tag\n";}
			
			$seen_tags{$tag}=1;
		}

	}

	print map { "$_\n" } keys %seen_tags;

	exit 0;

}


sub do_stats () {

	# We have the line count of each file. We can extract some information.
	
	my $total_input_lines = $.;
	my $common_lines = $lines{""} || 0;
	
	my $phra= "$input_file ($total_input_lines lines):";
	print $phra, "\n", '-'x(length $phra), "\n";

	# Get the maximum number of digits for some values
	my ($longest_tag,$longest_lines)=(0,0);
	foreach my $tag (keys %file) {
		next if $tag eq "";
		
		my $c;
		$c=&length_in_chars($tag);
		$longest_tag=$c if $c > $longest_tag;
		$c=length $lines{$tag};
		$longest_lines=$c if $c > $longest_lines;
	}
	$longest_tag= &length_in_chars(
		&out_file_name( 'x' x $longest_tag )
	); # Length of the full name, not just the largest tag
	

	foreach my $tag (keys %file) {
		next if $tag eq "";
		my $num=$lines{$tag};

		my $name=&out_file_name($tag);
		my $lin="%${longest_lines}u";  # Format string for the line counts
		
		print
		sprintf(
			' 'x($longest_tag-&length_in_chars($name)) .
			"%s: $lin lines = " .
			"$lin proper + $lin common\n",
			
		       	$name, $num,
			$num-$common_lines, $common_lines);
		
			
	}
	my $num_tags= scalar(keys %file) -1; # -1 because of the "" (not real)

	$lines{'-'} ||=0;
	print "Number of tags: $num_tags. Number of comments: $lines{'-'}.\n\n";

	&remove_temp();

	exit 0;
}


sub length_in_chars ($) {

	#   This is a little hack to get the length of a string
	# which we don't know if it's Unicode (UTF-8) or not.
	
	my $a = shift;

	# Perl can't know if $a is Unicode or not

	my $l = length $a; # Take its length as it if weren't
	
	# This sets the "utf8" bit for $b
	my $b;
        eval q{
		$b = pack "U0C*", unpack "C*", $a;
		#   We're hiding this from <5.6.0 versions,
		# which will get the length as if it weren't UTF-8
	};

	# Try to take its length as if it were Unicode
        eval q{
		#   If length detects some error (like malformed UTF-8 characters),
		# die and don't touch $l. Otherwise, it's UTF-8, so get the real $l
	        use warnings FATAL => 'all';
	        $l=length $b;
	};
	
	return $l;

}


sub show_version () {

	print <<'EOF';

dislines 1.0 < http://www.danielclemente.com/dislines/ >
Follows version 1 of the specification.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, version 2.

July 2005. Daniel Clemente Laboreo.

EOF

	exit 0;
}





sub show_help () {

	print <<'EOF';

Usage: dislines [OPTIONS] file.ext

Options:

 (nothing)         Process the file and create a file.TAG.ext for each tag
 -l, --list        Just show a list of the tags used in the file
 -t, --tags=LIST   Only include these tags. Ex: en,it,fr
 -o, --out=FILE    Use this base name for the output files (file.TAG.ext)
 -s, --stats       Just print some information about the files to be created
 -q, --quiet       Write only the necessary information to the screen
 -h, --help        Show this help (see the manual for the syntax)
 -v, --version     Print version information

 You may put a dash (-) instead of file names to use STDIN or STDOUT.
 
EOF

	exit 0;
}


# The end, really

