Tokens: Programme to discover tokens, where there are not.
Now available at https://puszcza.gnu.org.ua/projects/tokens/
This is Version 2, for version 1, go here.
Synopsis:
use TokensV2;
sub printFile;
my @FORMAT = (
['<Message Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From>(?:<User FriendlyName=".*?"/>)+</From><To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style=".*?")?>.*?</Text></Message>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Message Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?">(<From>(?:<User FriendlyName=".*?"/>)+</From>)<To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style="(.*?)")?>(.*?)</Text></Message>|;
my $F = join '<br />', $f =~ m|<User FriendlyName="(.*?)"/>|g;
print $fh "<p><font size=\"-2\">($d $t)</font> $F<br /><span style=\"$s\">$T</span></p>";
}
],
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></Invitation>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
],
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
],
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></Invitation>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
],
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
],
['<Join Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Join>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Join Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Join>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
],
['<Leave Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Leave>',
sub {
my $fh = $_[1];
my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Leave Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Leave>|;
print $fh "<p><font size=\"-2\">($d $t)</font> $f<br /><span style=\"$s\">$T</span></p>";
}
]
);
sub printFile {
return if @_==1;
if (@_==2) {
my $fh = $_[1];
${_[0]} =~ s/</</g;
print $fh "<p>${_[0]}</p>\n";
return;
}
# @_==5
printFile @{$_[1]}, $_[4];
my $coderef = $FORMAT[$_[3]][1];
&$coderef($_[0], $_[4]);
my $fh = $_[4];
print $fh "\n";
printFile @{$_[2]}, $_[4];
}
sub joinParseTree {
return if @_==0;
if (@_==1) {
return ${_[0]};
}
# @_==4
return joinParseTree(@{$_[1]}) . $_[0] . joinParseTree(@{$_[2]});
}
open my $fH, "<:encoding(utf8)", "tokens-processed.txt"; # H for HASH, not handle
my %HASH = map { /([^\s]*)/; $1 => undef } grep $_ ne "\n", <$fH>;
close $fH;
open $fH, ">>:encoding(utf8)", "tokens-processed.txt";
print $fH "----------bookmark----------\n";
local $/ = undef;
my $regex = join "\n", map $$_[0], @FORMAT;
while(glob "TXT/**") {
print "$_\n";
next if exists $HASH{$_};
open my $fh, "<:encoding(utf8)", $_;
my $text = <$fh>;
my $oName = $_;
$oName =~ s/TXT/HTML/;
$oName =~ s/\.txt/.html/;
my @parse_tree;
eval {
local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
alarm 5;
@parse_tree = parse($text, $regex);
alarm 0;
};
if ($@) {
die unless $@ eq "alarm\n"; # propagate unexpected errors
# timed out
print $fH "$_ timed out\n";
next;
}
my $parse_text = joinParseTree @parse_tree;
open $fhHTML, ">:encoding(utf8)", $oName;
print $fhHTML <<ENDDOC
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
</head>
<body>
ENDDOC
;
printFile @parse_tree, $fhHTML;
$text =~ s/-( *)(?=-)/- \1/g;
$text =~ s/(.{1,512})/<!-- \1 -->\n/g;
print $fhHTML "
<!-- Original text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
$text";
$parse_text =~ s/-( *)(?=-)/- \1/g;
$parse_text =~ s/(.{1,512})/<!-- \1 -->\n/g;
print $fhHTML "
<!-- Parsed text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
$parse_text
</body>
</html>";
close $fhHTML;
print $fH "$_\n";
}
TokensV2.pm file:
=pod
Tokens: Programme to discover tokens, where there are not.
Copyright 2013 Gabriel Czernikier
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
=cut
use strict;
#package declarations
sub digest_single;
sub parse_programme;
sub printAll;
my @REGEX;
my @MASK;
my $long_tokens;
my @DIGEST_SINGLE;
my $literal_char_count;
sub digest_single { # $target, $regex, $eaten_left_literal_chars, $eaten_right_literal_chars
#return unless $_[1] ne '';
#return if length $_[1] < 8; # x>(.*?)<
return unless ($_[2]+$_[3])/$literal_char_count<0.999999;
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
pos($_[0]) = undef;
goto VISITING unless $_[0] =~ /(${_[1]})/g;
my $digit = 2;
while( my $cg = eval '$'.$digit++ ) {
my @suspicious_tokens = $cg =~ /$long_tokens/g;
goto VISITING if @suspicious_tokens>=2;
}
my $pff = (pos $_[0]);
my $pii = $pff - (length $1) if defined $pff;
$DIGEST_SINGLE[$_[2]][$_[3]] = [$_[2], $_[3], $pii, $pff];
return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
VISITING:
my $re = $_[1];
# strip off left shelter, also understood as walking righwards across the regex .+? stopping at a hopefuly serviceable string
goto VISITING_2 unless $re =~ s/(.+?)(.\(\.\*\?\).|\([^.])/\2/; # .+? is also the "shelter"
my $increment_eaten_literal_chars = $+[1];
$increment_eaten_literal_chars -= 5 if $1 =~ /\(\.\*\?\)/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
# expansions
$re =~ s/^\(\?:(?:[^)]|\)[^?+*])+\)\*//;
$re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\+/\1\(\?:\1\)\*/;
$re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\?/\1/;
my ($el, $er, $pi, $pf) = digest_single $_[0], $re, $_[2]+$increment_eaten_literal_chars, $_[3];
VISITING_2:
$re = $_[1];
# strip off right shelter, also understood as walking leftwards across the regex .+? stopping at a hopefuly serviceable string
goto SUBDIGEST unless $re =~ s/(.*(?:.\(\.\*\?\).|\)[?+*]))(.+?)$/\1/; # .+? is the "shelter"
$increment_eaten_literal_chars = $+[2] - $-[2];
$increment_eaten_literal_chars -= 5 if $2 =~ /\(\.\*\?\)/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
# expansions
$re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\*$//;
$re =~ s/\(\?:((?:[^)]|\)[^?+*])+)\)\+$/\(\?:\1\)\*\1/;
$re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\?$//;
my ($el2, $er2, $pi2, $pf2) = digest_single $_[0], $re, $_[2], $_[3]+$increment_eaten_literal_chars;
SUBDIGEST:
$DIGEST_SINGLE[$_[2]][$_[3]] = [$el, $er, $pi, $pf] if (defined $pi) && ($er+$el<=$er2+$el2 || !defined $pi2);
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
$DIGEST_SINGLE[$_[2]][$_[3]] = [$el2, $er2, $pi2, $pf2] if (defined $pi2) && ($er2+$el2<$er+$el || !defined $pi);
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
$DIGEST_SINGLE[$_[2]][$_[3]] = [];
return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
}
sub digest_multiple {
my $regex_num = 0;
my $Pos_Ini;
my $Eaten_Left;
my $Eaten_Right;
my $Pos_Fin;
my $Regex_Num;
my @DIGEST_MULTIPLE = ();
for my $regex (@REGEX) {
@DIGEST_SINGLE = undef;
my @literal_chars = $regex =~ /[^.*(?:)+]/g;
$literal_char_count = scalar @literal_chars;
my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = digest_single $_[0], $regex, 0, 0;
return $eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num if(($eaten_right+$eaten_left)/$literal_char_count<0.2);
push @DIGEST_MULTIPLE, [$eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num] if defined $pos_ini && $pos_fin!=0;
$regex_num++;
}
$regex_num = 0;
while(@DIGEST_MULTIPLE!=0) {
my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = @{shift @DIGEST_MULTIPLE};
if($eaten_right+$eaten_left<$Eaten_Right+$Eaten_Left || !defined $Pos_Ini) {
$Pos_Ini = $pos_ini;
$Eaten_Left = $eaten_left;
$Eaten_Right = $eaten_right;
$Pos_Fin = $pos_fin;
$Regex_Num = $regex_num;
}
$regex_num++;
}
return $Eaten_Left, $Eaten_Right, $Pos_Ini, $Pos_Fin, $Regex_Num if defined $Pos_Ini;
}
sub parse_programme {
return if $_[0] eq '';
return $_[0] if length $_[0] < 8; # <x>y</x>
my ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num) = digest_multiple($_[0]);
if(not defined $pos_ini) {
#my $oldfh = select;
#select STDERR;
#print "$ARGV[1], [ini-no-parseable]${target}[fin-no-parseable]\n";
#select $oldfh;
return $_[0];
}
my $mask_right = @MASK[$regex_num];
$mask_right =~ s/.*(.{$eaten_right})/\1/;
my $mask_left = @MASK[$regex_num];
$mask_left =~ s/(.{$eaten_left}).*/\1/;
my $match_length = $pos_fin-$pos_ini;
my ($target_left,$match,$target_right) = $_[0] =~ /(.{$pos_ini})(.{$match_length})(.*)/;
shift; # discard unused argument
return $mask_left.$match.$mask_right, +[parse_programme $target_left], +[parse_programme $target_right], $regex_num;
}
sub printAll {
return if @_==0;
if (@_==1) {
print "[ini-nonparse]${_[0]}[fin-nonparse]\n";
return;
}
printAll @{$_[1]};
print "[ini-prod]${_[0]}[fin-prod]\n";
printAll @{$_[2]};
}
sub parse {
@REGEX = ();
@MASK = ();
my $target = shift;
my $regex = shift;
while($regex =~ /^(.+)$/mg) {
my $_ = $1;
my $other = $_;
# strip out non-literal chars from MASK
$other =~ s/\(\?:(.+?)\)\+/\1/g;
$other =~ s/\(\?:(.+?)\)\?/\1/g;
$other =~ s/\.\*\?//g;
push @MASK, $other;
# surround wildcards with capturing group for REGEX
s/(\.\*\?)/\(\1\)/g;
push @REGEX, $_;
}
$long_tokens = '\b' . join '\b|\b', grep length>=3,keys %{+{ map +($_=>undef), map /\w+/g, @REGEX }};
$long_tokens = $long_tokens . '\b';
parse_programme $target;
}
1;
No hay comentarios:
Publicar un comentario