Parsing SVG paths

TL;DR

Reinventing wheels: parsing the d attribute of paths in SVG.

I know, I know.

There is Image::SVG::Path on CPAN that does exactly this. But how hard can it be?!?

Well… a bit, indeed. But now it’s (mostly) in the past, so we can enjoy an intermediate-though-working byproduct, parsth:


#!/usr/bin/env perl
use 5.024;
use warnings;
use experimental qw< postderef signatures >;
no warnings qw< experimental::postderef experimental::signatures >;

# Driver: parse one path and dump the resulting data structure.
use Data::Dumper;
$Data::Dumper::Indent = 1;
# The path comes from the first command-line argument; a sample path is
# used when no argument is given.
say Dumper svg_path(shift // 'M0,0T10,10T10,0Z');

# Parse the string $d (the "d" attribute of an SVG path) and return
# whatever the parser built by pf_PARSE around svg_path_er() returns.
# The parser is assembled on first use only and kept for later calls.
sub svg_path ($d) {
   state $parse = pf_PARSE(svg_path_er());
   return $parse->($d);
}

# Return the top-level parser for a whole SVG path (built only once).
#
# The parser takes a reference to the text and returns a reference to an
# array of hashes, one for each set of arguments of each command. Every
# hash holds the keys produced by the command's argument parser plus a
# "command" key with the command letter as found in the input. Commands
# without arguments yield a hash with the "command" key only. When a
# moveto carries several coordinate pairs, the ones after the first are
# tagged as lineto ("m" becomes "l", "M" becomes "L").
#
# The parser returns nothing when the text does not start with a moveto,
# so that the caller can report the failure.
sub svg_path_er {
   state $retval = sub ($rtext) {
      if ($$rtext !~ m{\S}mxs) { # empty path
         # mark blanks as consumed, otherwise the caller would complain
         # about unparsed trailing text
         pos($$rtext) = length $$rtext;
         return [];
      }

      # the grammar does not depend on the input, build it once
      state $grammar = pf_sequence(
         svg_moveto(),
         pf_repeated(svg_drawto(), 0, -1), # *
      );

      pf_ws()->($rtext); # ignore initial spaces, if any
      my $ast = $grammar->($rtext) or return; # no initial moveto
      pf_ws()->($rtext); # ignore trailing spaces, if any

      # now "expand" all sub-commands with the respective sequences
      my @retval = map {
         my ($command, $sequence) = $_->@*;
         my $lc = ($command eq 'm') ? 'l' # later command
            : ($command eq 'M') ? 'L'
            : $command;
         map {
            # the first item keeps the command, the following ones get
            # the "later" variant
            (my $retval, $command) = ({$_->%*, command => $command}, $lc);
            $retval;
         } ($sequence || [{}])->@*;
      } ($ast->[0], $ast->[1]->@*);
      return \@retval;
   };
}

# Build the parser for one command: the command letter (matched
# regardless of case, with optional spaces around it), followed by a
# list of $item separated by comma_wsp() when $item is provided.
#
# The resulting parser yields [ $letter_as_found ] for argument-less
# commands and [ $letter_as_found, \@items ] otherwise.
sub svg_cmd ($letter, $item = undef) {
   my $letter_matcher = pf_regexp(qr{(?imxs:\s*([$letter])\s*)});

   # pf_regexp wraps the capture in an array, keep the bare letter only
   my $bare_letter = sub {
      my $match = $letter_matcher->($_[0]) or return;
      return $match->[0];
   };

   my @arguments = $item ? pf_list($item, comma_wsp()) : ();
   return pf_sequence($bare_letter, @arguments);
}

# Return the parser for any single drawing command, i.e. the first
# matching alternative among all supported commands. The parser is built
# on the first call and reused afterwards.
sub svg_drawto {
   state $drawto = pf_alternatives(
      svg_moveto(),
      svg_cmd('z'),                                                # closepath
      svg_cmd(l => coordinate_pair('target')),                     # lineto
      svg_cmd(h => coordinate('target')),                          # horizontal_lineto
      svg_cmd(v => coordinate('target')),                          # vertical_lineto
      svg_cmd(c => coordinate_pair_triplet(qw< cp1 cp2 target >)), # curveto
      svg_cmd(s => coordinate_pair_double(qw< cp2 target >)),      # smooth_curveto
      svg_cmd(q => coordinate_pair_double(qw< cp target >)),       # quadratic_bezier_curveto
      svg_cmd(t => coordinate_pair('target')),                     # smooth_quadratic_bezier_curveto
      svg_cmd(a => elliptical_arc_argument()),                     # elliptical_arc
   );
   return $drawto;
}

# Parser for moveto: letter "m" followed by coordinate pairs, each one
# stored under the "target" key.
sub svg_moveto {
   my $target = coordinate_pair('target');
   return svg_cmd(m => $target);
}

# Parser for the separator between arguments: either a comma with
# optional spaces around it, or at least one space. Built once.
sub comma_wsp {
   state $separator = pf_regexp(qr{(?mxs:(\s*,\s*|\s+))});
   return $separator;
}
# Build a parser for one number, returned as { $name => $text } where
# $text is the number exactly as written in the input.
#
# The accepted syntax follows the SVG path grammar for numbers: optional
# sign, then either digits with an optional fractional part ("12", "12.",
# "12.5") or a bare fractional part (".5"), then an optional exponent
# ("e-3"). Plain integers are matched as before.
sub coordinate ($name = 'whatever') {
   state $matcher = pf_regexp(
      qr{(?mxs:(
         [-+]?                                 # sign
         (?: [0-9]+ \.? [0-9]* | \. [0-9]+ )   # integer and/or fraction
         (?: [eE] [-+]? [0-9]+ )?              # exponent
      ))}
   );
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;
      return {$name => $match->[0]};
   };
}

# Build a parser for a fixed sequence of arguments with a comma_wsp()
# separator between each consecutive pair. Every argument parser is
# expected to yield a hash reference; the resulting parser merges all of
# them into a single hash reference, ignoring what separators yield.
sub args_list (@args) {
   my @parsers;  # arguments interleaved with separators
   my @wanted;   # positions of the arguments inside @parsers
   for my $arg (@args) {
      push @parsers, comma_wsp() if @parsers;
      push @wanted, scalar @parsers;
      push @parsers, $arg;
   }
   my $sequence = pf_sequence(@parsers);
   return sub ($rtext) {
      my $pieces = $sequence->($rtext) or return;
      my %merged = map { $_->%* } $pieces->@[@wanted];
      return \%merged;
   };
}

# Build a parser for an "x y" pair, yielding { $name => { x => .., y => .. } }.
# The underlying pair matcher does not depend on $name and is shared by
# all parsers built here.
sub coordinate_pair ($name) {
   state $xy = args_list(coordinate('x'), coordinate('y'));
   return sub ($rtext) {
      my $point = $xy->($rtext);
      return unless $point;
      return {$name => $point};
   };
}

# Build a parser for two consecutive coordinate pairs, stored under
# $name_a and $name_b respectively in the resulting hash.
sub coordinate_pair_double ($name_a, $name_b) {
   return args_list(map { coordinate_pair($_) } $name_a, $name_b);
}

# Build a parser for three consecutive coordinate pairs, stored under
# $name_a, $name_b and $name_c respectively in the resulting hash.
sub coordinate_pair_triplet ($name_a, $name_b, $name_c) {
   my @pairs = map { coordinate_pair($_) } $name_a, $name_b, $name_c;
   return args_list(@pairs);
}

# Build a parser matching $regexp and yielding { $name => $capture },
# where $capture is the first item provided by pf_regexp for the match.
sub named_regexper ($name, $regexp) {
   my $capture = pf_regexp($regexp);
   return sub ($rtext) {
      my $found = $capture->($rtext);
      return unless $found;
      return {$name => $found->[0]};
   };
}

# Build a parser for one set of elliptical arc arguments:
#
#    rx ry x-axis-rotation large-arc-flag sweep-flag x y
#
# The resulting hash has keys "radii" (a hash with "x" and "y"),
# "x-axis-rotation", "large-arc-flag", "sweep-flag" and "target" (a hash
# with "x" and "y").
#
# Radii are unsigned numbers; the rotation is a signed number, so that
# negative angles are accepted. Both allow a fractional part and an
# exponent, plain unsigned integers are matched as before.
sub elliptical_arc_argument {
   my $unsigned = qr{(?mxs:
      (?: [0-9]+ \.? [0-9]* | \. [0-9]+ )   # integer and/or fraction
      (?: [eE] [-+]? [0-9]+ )?              # exponent
   )};
   my $radius   = qr{(?mxs:($unsigned))};
   my $rotation = qr{(?mxs:([-+]?$unsigned))};
   my $flag     = qr{(?mxs:([01]))};
   my $matcher = args_list(
      named_regexper(rx => $radius),
      named_regexper(ry => $radius),
      named_regexper('x-axis-rotation' => $rotation),
      named_regexper('large-arc-flag' => $flag),
      named_regexper('sweep-flag' => $flag),
      coordinate_pair('target'),
   );
   return sub ($rtext) {
      my $match = $matcher->($rtext) or return;

      # group the two radii under a single key
      $match->{radii} = {
         x => delete($match->{rx}),
         y => delete($match->{ry}),
      };
      return $match;
   };
}


# Build a parser that tries each of the provided parsers in order and
# yields the first defined result, or nothing when all of them fail.
sub pf_alternatives {
   my @candidates = @_;
   return sub {
      for my $candidate (@candidates) {
         my $outcome = $candidate->($_[0]);
         return $outcome if defined $outcome;
      }
      return;
   };
}

# Build a parser that matches the literal string passed as first
# argument at the current position of the text.
#
# On success the position is moved past the literal and a fresh array
# reference is returned, holding the additional arguments when provided
# or the literal itself otherwise. On failure nothing is returned and
# the position is left alone.
sub pf_exact {
   my ($wlen, $what, @retval) = (length($_[0]), @_);
   unshift @retval, $what unless scalar @retval;
   return sub {
      my $rtext = $_[0];

      # pos is undefined until something sets it, i.e. when parsing
      # starts: that means "beginning of the text"
      my $pos = pos($$rtext) // 0;

      return if length($$rtext) - $pos < $wlen;
      return if substr($$rtext, $pos, $wlen) ne $what;
      pos($$rtext) = $pos + $wlen;
      return [@retval];
   };
}

# Build a parser for a non-empty list of items.
#
# Arguments: the item parser, an optional separator (a parser, or a plain
# string turned into a parser by pf_exact) and an optional flag that, when
# true, makes the parser attempt one separator after the last item too.
#
# The resulting parser yields a reference to an array with what the item
# parser yielded for each item, or nothing when not even one is found.
sub pf_list {
   my ($w, $s, $sep_as_last) = @_; # (what, separator, sep_as_last)
   $s = pf_exact($s) if defined($s) && !ref($s);
   return sub {
      my $rtext = $_[0];

      my $first = $w->($rtext);
      return unless defined $first;

      # items after the first one must be preceded by the separator,
      # when there is one
      my $following = sub {
         return if $s && !$s->($_[0]);
         return $w->($_[0]);
      };
      my $items = pf_repeated($following)->($rtext);

      $s->($rtext) if $s && $sep_as_last; # attempt last separator?
      unshift $items->@*, $first;
      return $items;
   };
}

# Wrap parser $expression into a function that takes the text (or a
# reference to it) and insists that all of it is parsed.
#
# The wrapper returns what $expression yields, and dies when $expression
# fails or when it stops before the end of the text. In the latter case
# the message includes up to 72 characters of the unparsed text, with the
# last three replaced by dots when there is more.
sub pf_PARSE {
   my ($expression) = @_;
   return sub {
      my $rtext = ref $_[0] ? $_[0] : \$_[0]; # avoid copying

      my $ast = $expression->($rtext);
      die "nothing parsed\n" unless $ast;

      my $pos = pos($$rtext) || 0;
      my $leftover = length($$rtext) - $pos;
      if ($leftover != 0) {
         my $offending = substr $$rtext, $pos, 72;
         substr $offending, -3, 3, '...' if $leftover > 72;
         die "unknown sequence starting at $pos <$offending>\n";
      }

      return $ast;
   };
}

# Build a parser matching regular expression $rx at the current position
# of the text.
#
# On success the position is moved past the match and a fresh array
# reference is returned, holding the additional arguments when provided
# or the first capture of $rx otherwise (undef when $rx captures
# nothing). On failure nothing is returned and the position is kept.
sub pf_regexp {
   my ($rx, @forced_retval) = @_;
   return sub {
      # The match MUST be done in scalar context: in list context the /g
      # modifier would keep matching as long as $rx matches again right
      # after the previous match, silently consuming more text than the
      # single token whose capture is returned.
      ${$_[0]} =~ m{\G()$rx}cgmxs or return;

      # $1 is the empty group in the pattern above, $2 is the first
      # capture in $rx
      my $retval = $2;
      return scalar(@forced_retval) ? [@forced_retval] : [$retval];
   };
}

# Build a parser that applies parser $w repeatedly.
#
# Arguments: the parser, the minimum number of matches (default 0) and
# the maximum number of matches (default -1, i.e. no limit). Usual
# quantifiers map as follows:
#
#    *  (0, -1)     ?  (0, 1)     +  (1, -1)     {n,m}  (n, m)
#
# The resulting parser yields a reference to an array with what $w
# yielded at each match, leaving the position right after the last one.
# When the minimum is not reached it yields nothing and the position is
# restored to where it started.
sub pf_repeated { # *(0,-1) ?(0,1) +(1,-1) {n,m}(n,m)
   my ($w, $m, $M) = ($_[0], $_[1] || 0, (defined($_[2]) ? $_[2] : -1));
   return sub {
      my ($rtext, $pos, $lm, $lM, @retval) = ($_[0], pos ${$_[0]}, $m, $M);
      while ($lM != 0) { # lm = local minimum, lM = local maximum
         defined(my $piece = $w->($rtext)) or last;
         $lM--;
         push @retval, $piece;
         --$lm if $lm > 0; # one less to reach the minimum

         # Record the position as soon as the minimum is reached,
         # including the very match that reaches it; before that, keep
         # the initial one so that a failure restores it.
         $pos = pos $$rtext if $lm == 0;
      }
      pos($$rtext) = $pos if $lM != 0;  # maybe "undo" last attempt
      return if $lm > 0;    # failed to match at least $min
      return \@retval;
   };
}

# Build a parser that applies all the provided parsers one after the
# other. Plain strings are accepted too, and turned into parsers with
# pf_exact.
#
# The resulting parser yields a reference to an array with what each
# parser yielded, in order. When any parser fails it yields nothing and
# the position is restored to where the sequence started.
sub pf_sequence {
   my @items = map { ref $_ ? $_ : pf_exact($_) } @_;
   return sub {
      my $rtext = $_[0];
      my $start = pos $$rtext;
      my @pieces;
      for my $item (@items) {
         my $piece = $item->($rtext);
         if (!defined $piece) { # failure, revert back
            pos($$rtext) = $start;
            return;
         }
         push @pieces, $piece;
      }
      return \@pieces;
   };
}

# Parser for one or more whitespace characters, built once.
sub pf_ws {
   state $spaces = pf_regexp(qr{(\s+)});
   return $spaces;
}

# Parser for zero or more whitespace characters, built once.
sub pf_wso {
   state $optional_spaces = pf_regexp(qr{(\s*)});
   return $optional_spaces;
}

I try to follow the grammar as much as possible, taking shortcuts here and there. The last part should remind you of what is described in Parsing toolkit in cglib.

We are converging…


Comments? Octodon, GitHub, Reddit, or drop me a line!