Parsing SVG paths

TL;DR

Reinventing wheels: parsing the d attribute of paths in SVG.

I know, I know.

There is Image::SVG::Path on CPAN that does exactly this. But how hard can it be?!?.

Well… a bit, indeed. But now it’s (mostly) in the past, so we can enjoy an intermediate-though-working byproduct, parsth:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
#!/usr/bin/env perl
use 5.024;
use warnings;
use experimental qw< postderef signatures >;
no warnings qw< experimental::postderef experimental::signatures >;

use Data::Dumper;
$Data::Dumper::Indent = 1;
say Dumper svg_path(shift // 'M0,0T10,10T10,0Z');

sub svg_path ($d) {
   state $parser = pf_PARSE(svg_path_er());
   return $parser->($d);
}

sub svg_path_er {
   state $retval = sub ($rtext) {
      return [] unless $$rtext =~ m{\S}mxs; # empty path

      pf_ws()->($rtext); # ignore initial spaces, if any
      my $ast = pf_sequence(
         svg_moveto(),
         pf_repeated(svg_drawto(), 0, -1), # *
      )->($rtext);
      #say Dumper $ast; exit 1;

      # now "expand" all sub-commands with the respective sequences
      my @retval = map {
         my ($command, $sequence) = $_->@*;
         my $lc = ($command eq 'm') ? 'l' # later command
            : ($command eq 'M') ? 'L'
            : $command;
         map {
            (my $retval, $command) = ({$_->%*, command => $command}, $lc);
            $retval;
         } ($sequence || [{}])->@*;
      } ($ast->[0], $ast->[1]->@*);
      return \@retval;
   };
}

sub svg_cmd ($letter, $item = undef) {
   my $rx = pf_regexp(qr{(?imxs:\s*([$letter])\s*)});
   return pf_sequence(
      sub { my $m = $rx->($_[0]) or return; return $m->[0] }, # "unwrap"
      ($item ? pf_list($item, comma_wsp()) : ()),
   );
}

sub svg_drawto {
   state $retval;
   return $retval if defined $retval;

   $retval = pf_alternatives (
      svg_moveto(),
      svg_cmd('z'), # closepath
      svg_cmd(l => coordinate_pair('target')), # lineto
      svg_cmd(h => coordinate('target')), # horizontal_lineto
      svg_cmd(v => coordinate('target')), # vertical_lineto
      svg_cmd(c => coordinate_pair_triplet(qw< cp1 cp2 target >)), # curveto
      svg_cmd(s => coordinate_pair_double(qw< cp2 target >)), # smooth_curveto
      svg_cmd(q => coordinate_pair_double(qw< cp target >)), # quadratic_bezier_curveto
      svg_cmd(t => coordinate_pair('target')), # smooth_quadratic_bezier_curveto
      svg_cmd(a => elliptical_arc_argument()), # elliptical_arc
   );
}

sub svg_moveto { svg_cmd(m => coordinate_pair('target')) }

sub comma_wsp { state $r = pf_regexp(qr{(?mxs:(\s*,\s*|\s+))}) }
sub coordinate ($name = 'whatever') {
   state $matcher = pf_regexp(qr{(?mxs:([-+]?[0-9]+))});
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      return {$name => $match->[0]};
   };
}

sub args_list (@args) {
   my @indexes = map { 2 * $_ } 0 .. $#args;
   (undef, @args) = map {(comma_wsp(), $_)} @args;
   my $matcher = pf_sequence(@args);
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      return { map { $_->%* } $match->@[@indexes]};
   };
}

sub coordinate_pair ($name) {
   state $matcher = args_list(
      coordinate('x'),
      coordinate('y'),
   );
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      return { $name => $match };
   };
}

sub coordinate_pair_double ($name_a, $name_b) {
   my $matcher = args_list(
      coordinate_pair($name_a),
      coordinate_pair($name_b),
   );
}

sub coordinate_pair_triplet ($name_a, $name_b, $name_c) {
   return args_list(
      coordinate_pair($name_a),
      coordinate_pair($name_b),
      coordinate_pair($name_c),
   );
}

sub named_regexper ($name, $regexp) {
   my $matcher = pf_regexp($regexp);
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      return { $name => $match->[0] };
   };
}

sub elliptical_arc_argument {
   my $number = qr{(?mxs:(\d+))};
   my $flag   = qr{(?mxs:([01]))};
   my $matcher = args_list(
      named_regexper(rx => $number),
      named_regexper(ry => $number),
      named_regexper('x-axis-rotation' => $number),
      named_regexper('large-arc-flag' => $flag),
      named_regexper('sweep-flag' => $flag),
      coordinate_pair('target'),
   );
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      $match->{radii} = {
         x => delete($match->{rx}),
         y => delete($match->{ry}),
      };
      return $match;
   };
}


sub pf_alternatives {
   my (@A, $r) = @_;
   return sub { (defined($r = $_->($_[0])) && return $r) for @A; return };
}

sub pf_exact {
   my ($wlen, $what, @retval) = (length($_[0]), @_);
   unshift @retval, $what unless scalar @retval;
   return sub {
      my ($rtext, $pos) = ($_[0], pos ${$_[0]});
      return if length($$rtext) - $pos < $wlen;
      return if substr($$rtext, $pos, $wlen) ne $what;
      pos($$rtext) = $pos + $wlen;
      return [@retval];
   };
}

sub pf_list {
   my ($w, $s, $sep_as_last) = @_; # (what, separator, sep_as_last)
   $s = pf_exact($s) if defined($s) && !ref($s);
   return sub {
      defined(my $base = $w->($_[0])) or return;
      my $rp = sub { return ($s && !($s->($_[0])) ? () : $w->($_[0])) };
      my $rest = pf_repeated($rp)->($_[0]);
      $s->($_[0]) if $s && $sep_as_last; # attempt last separator?
      unshift $rest->@*, $base;
      return $rest;
   };
}

sub pf_PARSE {
   my ($expression) = @_;
   return sub {
      my $rtext = ref $_[0] ? $_[0] : \$_[0]; # avoid copying
      my $ast = $expression->($rtext) or die "nothing parsed\n";
      my $pos = pos($$rtext) || 0;
      my $delta = length($$rtext) - $pos;
      return $ast if $delta == 0;
      my $offending = substr $$rtext, $pos, 72;
      substr $offending, -3, 3, '...' if $delta > 72;
      die "unknown sequence starting at $pos <$offending>\n";
   };
}

sub pf_regexp {
   my ($rx, @forced_retval) = @_;
   return sub {
      my (undef, $retval) = ${$_[0]} =~ m{\G()$rx}cgmxs or return;
      return scalar(@forced_retval) ? [@forced_retval] : [$retval];
   };
}

sub pf_repeated { # *(0,-1) ?(0,1) +(1,-1) {n,m}(n,m)
   my ($w, $m, $M) = ($_[0], $_[1] || 0, (defined($_[2]) ? $_[2] : -1));
   return sub {
      my ($rtext, $pos, $lm, $lM, @retval) = ($_[0], pos ${$_[0]}, $m, $M);
      while ($lM != 0) { # lm = local minimum, lM = local maximum
         defined(my $piece = $w->($rtext)) or last;
         $lM--;
         push @retval, $piece;
         if ($lm > 0) { --$lm } # no success yet
         else         { $pos = pos $$rtext } # ok, advance
      }
      pos($$rtext) = $pos if $lM != 0;  # maybe "undo" last attempt
      return if $lm > 0;    # failed to match at least $min
      return \@retval;
   };
}

sub pf_sequence {
   my @items = map { ref $_ ? $_ : pf_exact($_) } @_;
   return sub {
      my ($rtext, $pos, @rval) = ($_[0], pos ${$_[0]});
      for my $item (@items) {
         if (defined(my $piece = $item->($rtext))) { push @rval, $piece }
         else { pos($$rtext) = $pos; return } # failure, revert back
      }
      return \@rval;
   };
}

{ my $r; sub pf_ws  { $r ||= pf_regexp(qr{(\s+)}) } }
{ my $r; sub pf_wso { $r ||= pf_regexp(qr{(\s*)}) } }

I try to follow the grammar as much as possible, taking shortcuts here and there. The last part should remind of what described in Parsing toolkit in cglib.

We are converging…


Comments? Octodon, , GitHub, Reddit, or drop me a line!