Mostrando entradas con la etiqueta parse. Mostrar todas las entradas
Mostrando entradas con la etiqueta parse. Mostrar todas las entradas

martes, 18 de febrero de 2014

Tokens V2

Tokens: Programme to discover tokens, where there are not.

Now available at https://puszcza.gnu.org.ua/projects/tokens/
This is Version 2, for version 1, go here.

Synopsis:
use TokensV2;

sub printFile;
my @FORMAT = (
['<Message Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From>(?:<User FriendlyName=".*?"/>)+</From><To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style=".*?")?>.*?</Text></Message>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Message Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?">(<From>(?:<User FriendlyName=".*?"/>)+</From>)<To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style="(.*?)")?>(.*?)</Text></Message>|;
    my $F = join '<br />', $f =~ m|<User FriendlyName="(.*?)"/>|g;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $F<br /><span style=\"$s\">$T</span></p>";
  }
],
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></Invitation>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
],
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
],
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></Invitation>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
],
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
],
['<Join Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Join>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Join Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Join>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
],
['<Leave Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Leave>',
  sub {
    my $fh = $_[1];
    my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Leave Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Leave>|;
    print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
  }
]
);

sub printFile {
  return if @_==1;
  if (@_==2) {
    my $fh = $_[1];
    ${_[0]} =~ s/</&lt;/g;
    print $fh "<p>${_[0]}</p>\n";
    return;
  }
  # @_==5
  printFile @{$_[1]}, $_[4];
  my $coderef = $FORMAT[$_[3]][1];
  &$coderef($_[0], $_[4]);
  my $fh = $_[4];
  print $fh "\n";
  printFile @{$_[2]}, $_[4];
}

sub joinParseTree {
  return if @_==0;
  if (@_==1) {
    return ${_[0]};
  }
  # @_==4
  return joinParseTree(@{$_[1]}) . $_[0] . joinParseTree(@{$_[2]});
}

open my $fH, "<:encoding(utf8)", "tokens-processed.txt"; # H for HASH, not handle
my %HASH = map { /([^\s]*)/; $1 => undef } grep $_ ne "\n", <$fH>;
close $fH;
open $fH, ">>:encoding(utf8)", "tokens-processed.txt";
print $fH "----------bookmark----------\n";

local $/ = undef;

my $regex = join "\n", map $$_[0], @FORMAT;
while(glob "TXT/**") {
  print "$_\n";
  next if exists $HASH{$_};
  open my $fh, "<:encoding(utf8)", $_;
  my $text = <$fh>;
  my $oName = $_;
  $oName =~ s/TXT/HTML/;
  $oName =~ s/\.txt/.html/;
  my @parse_tree;
    eval {
            local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
            alarm 5;
      @parse_tree = parse($text, $regex);
            alarm 0;
    };
    if ($@) {
            die unless $@ eq "alarm\n";   # propagate unexpected errors
      # timed out
      print $fH "$_ timed out\n";
      next;
    }
  my $parse_text = joinParseTree @parse_tree;
  open $fhHTML, ">:encoding(utf8)", $oName;
  print $fhHTML <<ENDDOC
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
</head>
<body>
ENDDOC
;
  printFile @parse_tree, $fhHTML;
  $text =~ s/-( *)(?=-)/- \1/g;
  $text =~ s/(.{1,512})/<!-- \1 -->\n/g;
  print $fhHTML "
<!-- Original text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
$text";
  $parse_text =~ s/-( *)(?=-)/- \1/g;
  $parse_text =~ s/(.{1,512})/<!-- \1 -->\n/g;
  print $fhHTML "
<!-- Parsed text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
$parse_text
</body>
</html>";
  close $fhHTML;
  print $fH "$_\n";
}




TokensV2.pm file:

=pod
  Tokens: Programme to discover tokens, where there are not.

 Copyright 2013 Gabriel Czernikier



    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see .
=cut
use strict;
#package declarations
sub digest_single;
sub parse_programme;
sub printAll;

my @REGEX;
my @MASK;
my $long_tokens;
my @DIGEST_SINGLE;
my $literal_char_count;

sub digest_single {   # $target, $regex, $eaten_left_literal_chars, $eaten_right_literal_chars
  #return unless $_[1] ne '';
  #return if length $_[1] < 8;   # x>(.*?)<
  return unless ($_[2]+$_[3])/$literal_char_count<0.999999;
  return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
  pos($_[0]) = undef;
  goto VISITING unless $_[0] =~ /(${_[1]})/g;
  my $digit = 2;
  while( my $cg = eval '$'.$digit++ ) {
    my @suspicious_tokens = $cg =~ /$long_tokens/g;
    goto VISITING if @suspicious_tokens>=2;
  }
  my $pff = (pos $_[0]);
  my $pii = $pff - (length $1) if defined $pff;
  $DIGEST_SINGLE[$_[2]][$_[3]] = [$_[2], $_[3], $pii, $pff];
  return @{$DIGEST_SINGLE[$_[2]][$_[3]]};

  VISITING:
  my $re = $_[1];
  # strip off left shelter, also understood as walking righwards across the regex .+? stopping at a hopefuly serviceable string
  goto VISITING_2 unless $re =~ s/(.+?)(.\(\.\*\?\).|\([^.])/\2/;  # .+? is also the "shelter"
  my $increment_eaten_literal_chars = $+[1];
  $increment_eaten_literal_chars -= 5 if $1 =~ /\(\.\*\?\)/;   # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
  # expansions
  $re =~ s/^\(\?:(?:[^)]|\)[^?+*])+\)\*//;
  $re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\+/\1\(\?:\1\)\*/;
  $re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\?/\1/;
  my ($el, $er, $pi, $pf) = digest_single $_[0], $re, $_[2]+$increment_eaten_literal_chars, $_[3];

  VISITING_2:
  $re = $_[1];
  # strip off right shelter, also understood as walking leftwards across the regex .+? stopping at a hopefuly serviceable string
  goto SUBDIGEST unless $re =~ s/(.*(?:.\(\.\*\?\).|\)[?+*]))(.+?)$/\1/;  # .+? is the "shelter"

  $increment_eaten_literal_chars = $+[2] - $-[2];
  $increment_eaten_literal_chars -= 5 if $2 =~ /\(\.\*\?\)/;   # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
  # expansions
  $re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\*$//;
  $re =~ s/\(\?:((?:[^)]|\)[^?+*])+)\)\+$/\(\?:\1\)\*\1/;
  $re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\?$//;
  my ($el2, $er2, $pi2, $pf2) = digest_single $_[0], $re, $_[2], $_[3]+$increment_eaten_literal_chars;

  SUBDIGEST:
  $DIGEST_SINGLE[$_[2]][$_[3]] = [$el, $er, $pi, $pf] if (defined $pi) && ($er+$el<=$er2+$el2 || !defined $pi2);
  return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
  $DIGEST_SINGLE[$_[2]][$_[3]] = [$el2, $er2, $pi2, $pf2] if (defined $pi2) && ($er2+$el2<$er+$el || !defined $pi);
  return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
  $DIGEST_SINGLE[$_[2]][$_[3]] = [];
  return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
}

sub digest_multiple {
  my $regex_num = 0;
  my $Pos_Ini;
  my $Eaten_Left;
  my $Eaten_Right;
  my $Pos_Fin;
  my $Regex_Num;

  my @DIGEST_MULTIPLE = ();
  for my $regex (@REGEX) {
    @DIGEST_SINGLE = undef;
    my @literal_chars = $regex =~ /[^.*(?:)+]/g;
    $literal_char_count = scalar @literal_chars;
    my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = digest_single $_[0], $regex, 0, 0;
    return $eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num if(($eaten_right+$eaten_left)/$literal_char_count<0.2);
    push @DIGEST_MULTIPLE, [$eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num] if defined $pos_ini && $pos_fin!=0;
    $regex_num++;
  }

  $regex_num = 0;
  while(@DIGEST_MULTIPLE!=0) {
    my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = @{shift @DIGEST_MULTIPLE};
    if($eaten_right+$eaten_left<$Eaten_Right+$Eaten_Left || !defined $Pos_Ini) {
      $Pos_Ini = $pos_ini;
      $Eaten_Left = $eaten_left;
      $Eaten_Right = $eaten_right;
      $Pos_Fin = $pos_fin;
      $Regex_Num = $regex_num;
    }
    $regex_num++;
  }
  return $Eaten_Left, $Eaten_Right, $Pos_Ini, $Pos_Fin, $Regex_Num if defined $Pos_Ini;
}

sub parse_programme {
  return if $_[0] eq '';
  return $_[0] if length $_[0] < 8; # <x>y</x>
  my ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num) = digest_multiple($_[0]);
  if(not defined $pos_ini) {
    #my $oldfh = select;
    #select STDERR;
    #print "$ARGV[1], [ini-no-parseable]${target}[fin-no-parseable]\n";
    #select $oldfh;
    return $_[0];
  }
  my $mask_right = @MASK[$regex_num];
  $mask_right =~ s/.*(.{$eaten_right})/\1/;
  my $mask_left = @MASK[$regex_num];
  $mask_left =~ s/(.{$eaten_left}).*/\1/;
  my $match_length = $pos_fin-$pos_ini;
  my ($target_left,$match,$target_right) = $_[0] =~ /(.{$pos_ini})(.{$match_length})(.*)/;
  shift;  # discard unused argument
  return $mask_left.$match.$mask_right, +[parse_programme $target_left], +[parse_programme $target_right], $regex_num;
}

sub printAll {
  return if @_==0;
  if (@_==1) {
    print "[ini-nonparse]${_[0]}[fin-nonparse]\n";
    return;
  }
  printAll @{$_[1]};
  print "[ini-prod]${_[0]}[fin-prod]\n";
  printAll @{$_[2]};
}

sub parse {
  @REGEX = ();
  @MASK = ();
  my $target = shift;
  my $regex = shift;
  while($regex =~ /^(.+)$/mg) {
    my $_ = $1;
    my $other = $_;
    # strip out non-literal chars from MASK
    $other =~ s/\(\?:(.+?)\)\+/\1/g;
    $other =~ s/\(\?:(.+?)\)\?/\1/g;
    $other =~ s/\.\*\?//g;
    push @MASK, $other;
    # surround wildcards with capturing group for REGEX
    s/(\.\*\?)/\(\1\)/g;
    push @REGEX, $_;
  }

  $long_tokens = '\b' . join '\b|\b', grep length>=3,keys %{+{  map +($_=>undef), map /\w+/g, @REGEX  }};
  $long_tokens = $long_tokens . '\b';

  parse_programme $target;
}

1;

martes, 5 de noviembre de 2013

Tokens - programa en Perl para parsear XML

Sinopsis
use Tokens;

open my $regex_fh, '&lt;', $ARGV[0]; #regex.txt
open my $target_fh, '&lt;', $ARGV[1]; #file.txt

my $target;
{
  local $/ = undef;
  $target = &lt;$target_fh&gt;; #the content
  $target =~ s/\n/ /g;
}

my $regex;
{
  local $/ = undef;
  $regex = &lt;$regex_fh&gt;; #the content
}

printAll parse $target, $regex;
 

Códio (archivo: Tokens.pm):
=pod
 Copyright 2013 Gabriel Czernikier



    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see .
=cut
use strict;
#package declarations
sub digest_single;
sub parse_programme;
sub printAll;

my @token_separator = ('<', '/', '"', '=', '>', ' ');

my @REGEX;
my @MASK;
my $long_tokens;
my @DIGEST_SINGLE;

sub min_token_separator_left {
  my $regex = shift;
  my $T;
  $regex =~ /([@token_separator])/og;
  my $min = (pos $regex) - (length $1);
  my $T = $1;
  return ($T, $min) if $regex =~ m/([@token_separator])/o;
}

sub min_token_separator_right {
  my $regex = shift;
  $regex =~ /.*([@token_separator])/og;
  my $min = (pos $regex) - (length $1);
  my $T = $1;
  return ($T, $min) if $regex =~ m/([@token_separator])/o;
}

sub expand_left {
  $_[0] =~ s/^\(\?:(?:[^)]|\)[^+*])+\)\*//;
  $_[0] =~ s/^\(\?:((?:[^)]|\)[^+*])+)\)\+/\1\(\?:\1\)\*/;
}

sub trim_left {
  $_[0] =~ s/^\(\.\*\?\)//;
}

sub expand_right {
  $_[0] =~ s/\(\?:(?:[^)]|\)[^+*])+\)\*$//;
  $_[0] =~ s/\(\?:((?:[^)]|\)[^+*])+)\)\+$/\(\?:\1\)\*\1/;
}

sub trim_right {
  $_[0] =~ s/\(\.\*\?\)$//;
}

sub digest_single {
  my $target = shift;
  my $regex = shift;
  return unless $regex ne '';
  my $token_desde = shift;
  my $token_hasta = shift;
  return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
  my ($T, $pos)=min_token_separator_left($regex);
  return unless $target =~ /$regex/ or defined $pos;
  $target =~ /($regex)/g;
  my $digit = 2;
  while( my $cg = eval '$'.$digit++ ) {
    goto VISITING if $cg =~ /$long_tokens/;
  }
  my $pff = (pos $target);
  my $pii = $pff - (length $1) if defined $pff;
  VISITING:
  return ($token_desde, $token_hasta, $pii, $pff) unless defined $pos;
  my $re = $regex;
  expand_left $re;
  ($T, $pos)=min_token_separator_left($re);
  my $pos_aux = $pos+length($T);
  $re =~ s/.{$pos_aux}//;
  trim_left $re;
  my ($td, $th, $pi, $pf) = digest_single $target, $re, $token_desde+1, $token_hasta;
  $re = $regex;
  expand_right $re;
  ($T, $pos) = min_token_separator_right($re);
  $re =~ s/(.{$pos}).*/\1/;
  trim_right $re;
  my ($td2, $th2, $pi2, $pf2) = digest_single $target, $re, $token_desde, $token_hasta-1;
  $DIGEST_SINGLE[$token_desde][$token_hasta] = [$token_desde, $token_hasta, $pii, $pff] if (defined $pii) && ($pff-$pii>=$pf-$pi || !defined $pi) && ($pff-$pii>=$pf2-$pi2 || !defined $pi2);
  return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
  $DIGEST_SINGLE[$token_desde][$token_hasta] = [$td, $th, $pi, $pf] if (defined $pi) && ($pf-$pi>$pff-$pii || !defined $pii) && ($pf-$pi>=$pf2-$pi2 || !defined $pi2);
  return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
  $DIGEST_SINGLE[$token_desde][$token_hasta] = [$td2, $th2, $pi2, $pf2] if (defined $pi2) && ($pf2-$pi2>$pff-$pii || !defined $pii) && ($pf2-$pi2>$pf-$pi || !defined $pi);
  return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
  $DIGEST_SINGLE[$token_desde][$token_hasta] = [];
  return @{$DIGEST_SINGLE[$token_desde][$token_hasta]};
}

sub token_count {
  my $regex = shift;
  my @matches = $regex =~ /[@token_separator]/og;
  return scalar @matches;
}

sub digest_multiple {
  my $target = shift;
  my @R = @REGEX;
  my $regex_num = 0;
  my $Pos_Ini;
  my $Token_Desde;
  my $Token_Hasta;
  my $Pos_Fin;
  my $Regex_Num;

  my @DIGEST_MULTIPLE = ();
  while( defined(my $regex = shift @R)) {
    @DIGEST_SINGLE = undef;
    my ($token_desde, $token_hasta, $pos_ini, $pos_fin) = digest_single $target, $regex, 0, token_count($regex);
    return $token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num if($token_desde==0 && $token_hasta==token_count($regex));
    push @DIGEST_MULTIPLE, [$token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num] if defined $pos_ini;
    $regex_num++;
  }

  $regex_num = 0;
  while(@DIGEST_MULTIPLE!=0) {
    my ($token_desde, $token_hasta, $pos_ini, $pos_fin) = @{shift @DIGEST_MULTIPLE};
    if(defined $pos_ini && $pos_fin!=0 && ($pos_ini<$Pos_Ini || !defined $Pos_Ini)) {
      $Pos_Ini = $pos_ini;
      $Token_Desde = $token_desde;
      $Token_Hasta = $token_hasta;
      $Pos_Fin = $pos_fin;
      $Regex_Num = $regex_num;
    }
    $regex_num++;
  }
  return $Token_Desde, $Token_Hasta, $Pos_Ini, $Pos_Fin, $Regex_Num if defined $Pos_Ini;
}

sub parse_programme {
  my $target = shift;
  return if $target eq '';
  my ($token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num) = digest_multiple($target);
  if(not defined $pos_ini) {
    #my $oldfh = select;
    #select STDERR;
    #print "$ARGV[1], [ini-no-parseable]${target}[fin-no-parseable]\n";
    #select $oldfh;
    return $target;
  }
  my $mask_right = @MASK[$regex_num];
  my $repetitions = token_count($mask_right)-$token_hasta;
  $mask_right =~ s/.*((?:[@token_separator][^@token_separator]*){$repetitions})$/\1/;
  my $mask_left = @MASK[$regex_num];
  $mask_left =~ s/^((?:[^@token_separator]*[@token_separator]){$token_desde}).*/\1/;
  my $match = $target;
  $match =~ s/(.{$pos_fin}).*/\1/;
  $match =~ s/.{$pos_ini}//;
  my $produccion = $mask_left . $match . $mask_right;
  $target =~ /(.{$pos_ini})/;
  my $target_left = $1;
  $target =~ /.{$pos_fin}(.*)/;
  my $target_right = $1;
  return $produccion, +[parse_programme $target_left], +[parse_programme $target_right];
}

sub printAll {
  return if @_==0;
  if (@_==1) {
    print "[ini-nonparse]${_[0]}[fin-nonparse]\n";
    return;
  }
  printAll @{$_[1]};
  print "[ini-prod]${_[0]}[fin-prod]\n";
  printAll @{$_[2]};
}

sub parse {
  my $target = shift;
  my $regex = shift;
  while($regex =~ /^(.+)$/mg) {
    my $_ = $1;
    my $other = $_;
    $other =~ s/\(\?:(.+?)\)\+/\1/g;
    $other =~ s/\.\*\?//g;
    push @MASK, $other;
    s/(\.\*\?)/\(\1\)/g;
    push @REGEX, $_;
  }

  $long_tokens = join '|', grep length>=3,keys %{+{  map +($_=>undef), map /\w+/g, @REGEX  }};

  parse_programme $target;
}

1;



Input: 

regex2-group.txt
&lt;MyTag1 myAttr1=".*?" myAttr2=".*?"&gt;(?:&lt;MyTag2 myAttr3=".*?" myAttr4=".*?"&gt;.*?&lt;/MyTag2&gt;)+&lt;/MyTag1&gt;

regex3-multiple.txt
&lt;MyTag1 myAttr1=".*?" myAttr2=".*?"&gt;(?:&lt;MyTag2 myAttr3=".*?" myAttr4=".*?"&gt;.*?&lt;/MyTag2&gt;)+&lt;/MyTag1&gt;
&lt;Foo myFoo=".*?" myFoo2=".*?"&gt;&lt;Bar1 myBar1=".*?" myBar1b=".*?" myBar1c=".*?"&gt;&lt;/Bar1&gt;&lt;Bar2 myBar2=".*?" myBar2b=".*?"&gt;&lt;Bar2Sub1&gt;.*?&lt;/Bar2Sub1&gt;&lt;/Bar2&gt;&lt;/Foo&gt;
&lt;MyA myA=".*?"&gt;&lt;MyAB&gt;(?:&lt;MyB myB=".*?"&gt;&lt;/MyB&gt;)+&lt;/MyAB&gt;&lt;MyAC&gt;(?:&lt;MyC myC=".*?"&gt;&lt;/MyC&gt;)+&lt;/MyAC&gt;&lt;/MyA&gt;
&lt;TagT attrT=".*?"&gt;.*?&lt;/TagT&gt;

file2-broken-left-broken-right-repetitions.txt
Tag2 myAttr3="myVal3Pre" myAttr4="myVal4Pre"&gt;MyText1Pre&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3" myAttr4="myVal4"&gt;MyText1&lt;/MyTag2&gt;&lt;/MyTag1&gt;&lt;MyTag1 myAttr1="myVal1b" myAttr2="myVal2b"&gt;&lt;MyTag2 myAttr3="myVal3b" myAttr4="myVal4b"&gt;MyText1b&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3bPos" myAttr4="myVal4bPos"&gt;MyText1bPos&lt;/MyTag2&gt;&lt;/MyTag1&gt;&lt;MyTag1 myAttr1="myVal1c" myAttr2="myVal2c"&gt;&lt;MyTag2 myAttr3="myVal3c" myAttr4="myVal4c"&gt;MyText1c&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3cPos" myAtt

file3-broken-left-broken-right-repetitions-multiple.txt
Tag2 myAttr3="myVal3Pre" myAttr4="myVal4Pre"&gt;MyText1Pre&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3" myAttr4="myVal4"&gt;MyText1&lt;/MyTag2&gt;&lt;/MyTag1&gt;&lt;Foo myFoo="foo-123" myFoo2="foo-456"&gt;&lt;Bar1 myBar1="bar-123" myBar1b="bar-456" myBar1c="bar-789"&gt;&lt;/Bar1&gt;&lt;Bar2 myBar2="bar-135" myBar2b="bar-790"&gt;&lt;Bar2Sub1&gt;BarSubText&lt;/Bar2Sub1&gt;&lt;/Bar2&gt;&lt;/Foo&gt;&lt;MyTag1 myAttr1="myVal1b" myAttr2="myVal2b"&gt;&lt;MyTag2 myAttr3="myVal3b" myAttr4="myVal4b"&gt;MyText1b&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3bPos" myAttr4="myVal4bPos"&gt;MyText1bPos&lt;/MyTag2&gt;&lt;/MyTag1&gt;&lt;MyA myA="MYa-123"&gt;&lt;MyAB&gt;&lt;MyB myB="MYb-123"&gt;&lt;/MyB&gt;&lt;MyB myB="MYb-456"&gt;&lt;/MyB&gt;&lt;MyB myB="MYb-123-bis"&gt;&lt;/MyB&gt;&lt;/MyAB&gt;&lt;MyAC&gt;&lt;MyC myC="mycR"&gt;&lt;/My M&amp;; &lt;MyTag1 myAttr1="myVal1c" myAttr2="myVal2c"&gt;&lt;MyTag2 myAttr3="myVal3c" myAttr4="myVal4c"&gt;MyText1c&lt;/MyTag2&gt;&lt;MyTag2 myAttr3="myVal3cPos" myAtt  &lt;TagT attrT="valueT"&gt;TextT-pr