#!/usr/bin/perl -w # # dislines: distribute in several files the lines of a tagged text. # Version 1.1 (c) 12-7 to 26-7-2005 Daniel Clemente Laboreo # http://www.danielclemente.com/dislines/ # # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # # Super-quick syntax guide: # (full syntax at): http://www.danielclemente.com/dislines/syntax.en.html # # Simple: @tag # Multiple: @tag1,tag2,tag3 # Blocks: open with @{tag or @tag{ and close them with @}tag @tag} or @} # Comments: @----- or in a block. Any number of - # Repeat last tag used: @""""""" Any number of " # You can use a lot of @ if you want: @@@@@@@@tag # Common lines can also have it: @@ This is common # # # # # The idea for the program is: # # 1. Create a temporary file for the common lines # 2. For every line of the file: # 3. | If it's common (no tag), add it to the global file created in (1) # 4. | If it has a tag or more attached, do for each tag: # 5. | | If it's the first time we see this tag: # 6. | | | Duplicate the global file to make a temporary file for this tag # 7. | | Add the line to the temporary file of the tag # 8. | Also add the line to the global file # 9. Now turn the temporary files into the final ones and delete the global # # # use v5.0; # or later use strict; # use warnings; # Not on <5.6.0. I'm using -w use Getopt::Long; use File::Copy; use File::Basename; # # Customizable settings: # # Change to 1 to better understand the code: my $DEBUG=0; # How to separate the "TAG" from the "file" in "file.TAG.ext" my $SEP_CHAR='.'; # file.TAG.ext # my $sep_char='-'; # file-TAG.ext (eg. index-en.html) # If you don't like ats (@), you can change the used symbol here. # Of course, you must put the one you use in your files. my $AT='@'; # Options from the command line my ( $just_list, $only_tags, $out_base, $just_stats, $show_help, $show_version, $quiet, ); GetOptions ( "list" => \$just_list, "tags=s" => \$only_tags, "out=s" => \$out_base, "stats" => \$just_stats, "quiet" => \$quiet, "help" => \$show_help, "version" => \$show_version, ) or die "Strange parameters passed. Stopping"; &show_help() if $show_help; &show_version() if $show_version; my $input_file; if ($#ARGV ne 0) { print STDERR "I need the name of the file to process. See help (-h)\n"; exit -1; } $input_file = pop @ARGV; open (INPUT, $input_file) or &system_error("Can't open $input_file"); warn "Be careful: $input_file doesn't seem a text file. Warning" if not $quiet and not -T $input_file and not $input_file eq "-"; # Once opened, we just want its name, to create temporary files $^O =~ s/^dos$/msdos/i; # dosemu said just "dos" fileparse_set_fstype($^O); # and this requires "msdos" $input_file = basename ($input_file); # If not specified, use the same name as the base for the output files $out_base ||= $input_file; # Don't print messages if we're using STDOUT $quiet=1 if $out_base eq "-"; my %file=(); # For each tag, it has its associated file handler. # tag "" means common text my %lines=(); # For stats: How many lines has each file &do_list() if $just_list; print "Processing $input_file\n" if not $quiet; my $last_tag=""; # Last tag seen ( for @"" ) my $in_block=""; # Are we inside a block? If so, which tags is it for? # Both variables contain a list of tags, like "en,eo,it" # Open buffer for common lines # 'autovivify' on open() is just for >=5.6.0 # So is the 3 parameters version... open ( COMMON, "+>". &temp_file_name("") ) or &system_error("Can't create temporary file"); $file{""}=*COMMON; while ( ) { my ($command, $rest_of_line); chomp; # No newlines ( \n ) at the end if ($DEBUG) {print "$.: $_\n";} ($command, $rest_of_line)= /^${AT}+(.*?)(?: (.*))?$/o or ($command, $rest_of_line)=("",$_); # Common line (no tag) # Handle @en\n and others $rest_of_line="" if not defined $rest_of_line; # Skip comments: @--- if ($command =~ /^\-+$/) { $lines{"-"}++ if $just_stats; next; } # Handle @{ and @} if ($command =~ /^\{(.*)$/ or $command =~ /^(.*)\{$/) { # Opening a block &bad_syntax("You can't nest blocks") if $in_block; &bad_syntax("Block with no tags") if not $1; $in_block=$1; if ($DEBUG) {print "Entering block $in_block\n";} next; } elsif ($command =~ /^\}(.*)$/ or $command =~ /^(.*)\}$/) { # Closing a block &bad_syntax("You closed block $1, but no block was opened") if (not $in_block); &bad_syntax("You closed block $1, but I was expecting $in_block") if ($1 and $1 ne $in_block); if ($DEBUG) {print "Going out of block $in_block\n";} $in_block=""; next; } # Handle blocks, and @"" if ($in_block) { &bad_syntax("Use of tag $command not allowed inside a block") if $command ne ""; # Translate the block @{tag to a simple @tag for each line $command=$in_block; } else { # Things to do when you're not into a block $command=$last_tag if $command =~ /"+/; } if ($command eq "") { # Not tagged. It may be a common line, or rest inside a block if (not $in_block) { # Common line # Add it to the global buffer ($file{""}) # and to each known file foreach my $tag ( keys %file ) { print { $file{$tag} } "$rest_of_line\n"; $lines{$tag}++ if $just_stats; } if($DEBUG) {print "Adding common line: $rest_of_line\n";} next; # No tags to process } else { $command=$in_block; } } # Handle things like @,,,, &bad_syntax("Don't do that with the commas") if $command =~ /^,+$/; # Iterate over the t1, t2, t3 of a @t1,t2,t3 foreach my $tag ( split /,/, $command ) { &bad_syntax("Don't include null tag names") if $tag eq ""; # as in the case of a @a,,b &bad_syntax("Illegal tag name $tag (remove the '$&')") if $tag =~ /[\/\\\@\{\}]/; # now, tag is correct (it doesn't match /[ ,]/ nor /[\@{}]/ ) if ($tag =~ /^\-*$/) { # it's a comment $lines{"-"}++ if $just_stats; next; } next if ($only_tags and $only_tags !~ /\b$tag\b/); # Not selected if (! $file{$tag} ) { # New tag found # Create temporary file for this tag local (*HANDLE); open ( HANDLE, "+>". &temp_file_name($tag) ) or &system_error("Can't create temporary file"); $file{$tag}=*HANDLE; # Copy all the lines from the global file to this one seek $file{""}, 0, 0 and copy( $file{""} , $file{$tag} ) or &system_error("Problem duplicating file"); $lines{$tag}=$lines{""} if $just_stats; if ($DEBUG) {print "New tag found: $tag\n";} } # Write in each tag's buffer print { $file{$tag} } "$rest_of_line\n"; $lines{$tag}++ if $just_stats; } # Make @"" work $last_tag=$command if not $in_block; if ($DEBUG) {print ">>>>> Command: $command\tLine: $rest_of_line\n";} } # No more lines to process &bad_syntax("You forgot to close block $in_block") if $in_block; # Just finished processing; if you wanted just the statistics, here are they &do_stats() if $just_stats; # Everything ok, so make the temporary files be the final ones close INPUT; close $file{""}; delete $file{""}; unlink &temp_file_name("") or &system_error("Error deleting temporary file"); foreach my $tag (keys %file) { my $temp=&temp_file_name($tag); my $name=&out_file_name($tag); if ($out_base ne "-") { close $file{$tag}; move( $temp, $name ) or &system_error("Error renaming temporary file"); } else { # Write all files -concatenated- to STDOUT # Hope that the user had selected just one with -t my $handler = $file{$tag}; seek $handler, 0, 0; print while ( <$handler> ); close $file{$tag}; } print "Wrote $name\n" if not $quiet; } &remove_temp() if ($out_base eq "-"); exit 0; # And here ended the program sub bad_syntax ($) { my $message = shift; print STDERR "Error at line $.: $message\n"; &remove_temp(); die "Stopping"; } sub system_error ($) { my $message = shift; print STDERR "$message: $!\n"; &remove_temp(); die "Stopping"; } sub remove_temp () { foreach my $tag (keys %file) { my $name = &temp_file_name($tag); if ($DEBUG) {print "Deleting temporary file $name\n";} # Don't stop if there are problems; we want most files removed unlink $name or warn("Could not delete $name: $!."); } } sub add_tag ($$) { # (file_name, $tag_name) # From a.html -> a.TAG.html # a.b.txt -> a.b.TAG.txt # abc -> abc.TAG # my $file_name=shift; my $tag_name=shift; my ($name, $dot_ext) = $file_name =~ /^(.*?)(\.[^\.]*)?$/; $dot_ext="" if not $dot_ext; # file . TAG .ext return "${name}${SEP_CHAR}${tag_name}${dot_ext}"; } sub temp_file_name ($) { # Mmmm... you shouldn't use more than 1 point in your # file names if you're using FAT16. return "." . &add_tag($input_file, shift); } sub out_file_name ($) { # $out_base eq $input_file by default (unless you specify --out) return &add_tag($out_base, shift); } sub do_list () { # We're not doing a full parse of the file, # neither check whether it is valid. # We just search for lines beginning with @ to extract the tags. my %seen_tags=(); print "List of tags used in $input_file:\n" if not $quiet; while () { if ($DEBUG) {print "$. $_\n";} # Take the command from the @command Rest_of_line next unless /^${AT}+(?!\@)([^ ]+)[ \n]/o; if ($DEBUG) {print "The command seems $1\n";} foreach my $tag (split /[,\{\}]/, $1) { next if $tag =~ /[\\\/\@]/; # skip the erroneous tags next if not $tag; # skip @"" and @-- next if $tag =~ /^\"+$/ or $tag =~ /^\-+$/; if ($DEBUG) {print "$tag\n";} $seen_tags{$tag}=1; } } print map { "$_\n" } keys %seen_tags; exit 0; } sub do_stats () { # We have the line count of each file. We can extract some information. my $total_input_lines = $.; my $common_lines = $lines{""} || 0; my $phra= "$input_file ($total_input_lines lines):"; print $phra, "\n", '-'x(length $phra), "\n"; # Get the maximum number of digits for some values my ($longest_tag,$longest_lines)=(0,0); foreach my $tag (keys %file) { next if $tag eq ""; my $c; $c=&length_in_chars($tag); $longest_tag=$c if $c > $longest_tag; $c=length $lines{$tag}; $longest_lines=$c if $c > $longest_lines; } $longest_tag= &length_in_chars( &out_file_name( 'x' x $longest_tag ) ); # Length of the full name, not just the largest tag foreach my $tag (keys %file) { next if $tag eq ""; my $num=$lines{$tag}; my $name=&out_file_name($tag); my $lin="%${longest_lines}u"; # Format string for the line counts print sprintf( ' 'x($longest_tag-&length_in_chars($name)) . "%s: $lin lines = " . "$lin proper + $lin common\n", $name, $num, $num-$common_lines, $common_lines); } my $num_tags= scalar(keys %file) -1; # -1 because of the "" (not real) $lines{'-'} ||=0; print "Number of tags: $num_tags. Number of comments: $lines{'-'}.\n\n"; &remove_temp(); exit 0; } sub length_in_chars ($) { # This is a little hack to get the length of a string # which we don't know if it's Unicode (UTF-8) or not. my $a = shift; # Perl can't know if $a is Unicode or not my $l = length $a; # Take its length as it if weren't # This sets the "utf8" bit for $b my $b; eval q{ $b = pack "U0C*", unpack "C*", $a; # We're hiding this from <5.6.0 versions, # which will get the length as if it weren't UTF-8 }; # Try to take its length as if it were Unicode eval q{ # If length detects some error (like malformed UTF-8 characters), # die and don't touch $l. Otherwise, it's UTF-8, so get the real $l use warnings FATAL => 'all'; $l=length $b; }; return $l; } sub show_version () { print <<'EOF'; dislines 1.0 < http://www.danielclemente.com/dislines/ > Follows version 1 of the specification. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. July 2005. Daniel Clemente Laboreo. EOF exit 0; } sub show_help () { print <<'EOF'; Usage: dislines [OPTIONS] file.ext Options: (nothing) Process the file and create a file.TAG.ext for each tag -l, --list Just show a list of the tags used in the file -t, --tags=LIST Only include these tags. Ex: en,it,fr -o, --out=FILE Use this base name for the output files (file.TAG.ext) -s, --stats Just print some information about the files to be created -q, --quiet Write only the necessary information to the screen -h, --help Show this help (see the manual for the syntax) -v, --version Print version information You may put a dash (-) instead of file names to use STDIN or STDOUT. EOF exit 0; } # The end, really