Article 7717 of comp.lang.perl: Xref: feenix.metronet.com comp.lang.perl:7717 Newsgroups: comp.lang.perl Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!pipex!uunet!munnari.oz.au!metro!usage!news From: cameron@cse.unsw.edu.au (Cameron Simpson) Subject: Re: Stumped at To: header parsing Message-ID: To: shutton@copper.ucs.indiana.edu (Scott K. Hutton) Followup-To: comp.lang.perl Sender: news@usage.csd.unsw.OZ.AU Nntp-Posting-Host: fuligin.spectrum.cs.unsw.oz.au Reply-To: cameron@cse.unsw.edu.au Organization: CS&E Computing Facility, Uni Of NSW, Oz References: Errors-To: cameron@cse.unsw.edu.au Date: Mon, 8 Nov 1993 12:53:54 GMT Return-Receipt-To: cameron@cse.unsw.edu.au Lines: 330 | I'm at a loss for an efficient way to parse a nasty header in the form | of: | | To: foobar@baz.com "Frank Oobar, Director", quux@biff.bitnet (BIFF) | | The line needs to be split into its component addresses, but it should | never be split on a comma that occurs within quotes or parens. I | can't figure out a pattern that will work for this and might be forced | to scan the line for commas and figuring out if we're in quotes/parens | or not. | | Surely someone out there has already invented this wheel... | | Be happy to share a smattering of other mail parsing code in return | (or, for that matter, even if you can't help). I'm working up a | mail parsing package, since that's what I seem to do the most. | | -Scott I append my rfc822.pl package. Let me know if it uses things I hacen't included, or of any bugs you find. What you want are &parseaddrs and &rawaddrs. - Cameron Simpson cameron@cse.unsw.edu.au, DoD#743 -- Hacker: One who accidentally destroys. Wizard: One who recovers afterwards. #!/bin/sh # sed 's/^X//' > rfc822.pl <<'EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl' X#!/usr/local/bin/perl X# X# Code to support RFC822-style message headers. X# X# &clrhdrs X# Empty %'hdrs and @'hdrs. X# X# &hdrkey Field name to key for array. X# &hdrnorm Field name to output form (capitalise words). X# X# &hdr(name) -> @values or undef X# Returns text of header as list (if accomodating multiple entries) X# or the list joined with "\t\n" or ", " if in scalar context. X# X# &addhdrs(@message_lines) X# Extract leading header lines from @lines up to blank line if present X# and add to @'hdrs. Then rebuild %'hdrs to be the content of each line X# keyed by downcased field name (and _ -> -). Multiple bodies are joined X# by ", " if in $rfc822'listfieldptn or by "\t\n" otherwise. Returns the X# unprocessed lines. X# X# &delhdrs(@field_names) X# Remove all references to the specified headers from @'hdrs and %'hdrs. X# X# &synchdrs X# Rebuild %'hdrs from @'hdrs. X# X# &parseaddrs($addresslist) -> @(precomment, address, postcomment) X# Break comma separated address list into a list of tuples, X# being leading comment, address portion, trailing comment. X# X# &rawaddrs(@(pre,addr,post)) -> @addrs X# Extract the middle elements from a 3-tuple list. X# X# &msgid X# Generate a message-id for an article. X# X Xpackage rfc822; X X@mailhdrs=('to','cc','bcc','from','sender','reply-to','return-receipt-to', X 'errors-to'); X@newshdrs=('newsgroups','followup-to'); X$mailptn=join('|',@mailhdrs); X$newsptn=join('|',@newshdrs); X X@listhdrs=(@mailhdrs,@newshdrs,'keywords'); X$listfieldptn=join('|',@listhdrs); X X&clrhdrs; X Xsub clrhdrs X { undef %'hdrs, @'hdrs; X $synced=1; X } X Xsub hdr X { local($_)=@_; X X $_=&hdrkey($_); X X &synchdrs; X X return undef unless defined($'hdrs{$_}); X X return $'hdrs{$_} unless wantarray; X X local(@bodies)=(); X X for $hdr (@'hdrs) X { push(@bodies,$') if $hdr =~ /^$_:\s*/; X } X X @bodies; X } X X# add a line to @hdrs and %hdrs X# just adds to @hdrs until it gets "" or "\n" and then sets up %hdrs X# field names matching $commaptn are concatenated with commas, X# otherwise with "\n\t". Xsub addhdrs # @lines -> @remaining_lines X { local($commaptn)=$listfieldptn; X local($_,$hdr); X X $hdr=''; X while (defined($_=shift)) X { s/\r?\n$//; X last if !length; X X if (/^\s/) X { $hdr.="\n$_"; X } X else X { length($hdr) && push(@'hdrs,$hdr); X $hdr=$_; X } X } X X length($hdr) && push(@'hdrs,$hdr); X X $synced=0; X X @_; X } X X# delete all instances of specified headers Xsub delhdrs # (@fieldnames) -> void X { local(@fields)=@_; X local($field,@newhdrs); X X for $field (@fields) X { $field=&hdrkey($field); X @hdrs=eval 'grep(!/^$field:/o,@hdrs)'; X } X X $synced=0; X } X X# replace header lines Xsub rephdrs # (@headerlines) -> void X { local(@reps)=@_; X local($_,$field); X X for (@reps) X { next unless /^([-\w]+):/; X $field=&hdrkey($1); X X for $hdr (@'hdrs) X { $hdr =~ s/^$field:/X-Original-$&/i; X } X } X X $synced=0; X X &addhdrs(@reps); X } X X# Get key from field. Xsub hdrkey X { local($_)=@_; X tr/_A-Z/-a-z/; X $_; X } X X# Get normal form of field name. Xsub hdrnorm X { local($_)=&hdrkey($_[0]); X X print STDERR "norm($_) -> "; X s/\b[a-z]/\u$&/g; X print STDERR "$_\n"; X $_; X } X X# Rebuild %'hdrs from @'hdrs. Xsub synchdrs # (void) -> (void) X { return if $synced; X X local($key,$field,$_); X X undef %'hdrs; X for (@'hdrs) X { if (/^([^\s:]+):\s*/) X { $key=$1; X $field=$'; $field =~ s/^\s+//; X $key=&hdrkey($key); X if (defined($'hdrs{$key})) X { if ($key =~ /^$commaptn$/o) X { $'hdrs{$key}.=', '; X } X else X { $'hdrs{$key}.="\n\t"; X } X X $'hdrs{$key}.=$field; X } X else X { $'hdrs{$key}=$field; X } X } X } X X $synced=1; X } X X# parse an RFC822 address list returning a list of tuples X# (leading comment, address, trailing comment, ...) Xsub parseaddrs # (addrlist) -> @(precomment, addr, postcomment) X { local($_)=@_; X local(@parsed); X local($pre,$addr,$post)=('','',''); X X s/^\s+//; X while (length) X { if (/^,/) X # end of currently building address X { $_=$'; X if (length($pre) && !length($addr)) X { $addr=$pre; $pre=''; X } X X if (length($pre) || length($addr) || length($post)) X { push(@parsed,$pre,$addr,$post); X } X X $pre=''; X $addr=''; X $post=''; X } X elsif (!length($addr) && /^[-\w_.]+(@[-\w_.]+)?/) X { $_=$'; X $addr=$&; X } X elsif (/^"([^"]|\\")*"/ || /^'([^']|\\')*'/) X { $_=$'; X if (length($addr)) X { $post .= " $&"; X } X else X { $pre .= " $&"; X } X } X elsif (/^<[^>\s]*>/) X { $_=$'; X if (length($addr)) X { $pre.=' '.$addr; X if (length($post)) X { $pre.=' '.$post; X $post=''; X } X } X X $addr=$&; X } X elsif (/^[^,\s]+/) X { $_=$'; X if (length($addr)) X { $post.=" $&"; X } X else X { $pre.=" $&"; X } X } X else X { print STDERR "trouble parsing, remaining address is \"$_\"\n"; X } X X s/^\s+//; X } X X if (length($pre) && !length($addr)) X { $addr=$pre; $pre=''; X } X X if (length($pre) || length($addr) || length($post)) X { push(@parsed,$pre,$addr,$post); X } X X for (@parsed) X { s/^\s+//; X } X X @parsed; X } X X# strip out the raw addresses from the result of &parseaddrs Xsub rawaddrs X { local(@rawaddrs); X local($_); X X while (defined($_=shift)) X { $_=shift; X last if !defined; X s/^<([^>\s]+)>$/$1/; X push(@rawaddrs,$_); X shift; X } X X @rawaddrs; X } X X$msgid_count=0; Xsub msgid X { local($sec,$min,$hour,$mday,$mon,$year,@etc)=localtime(time); X X $msgid_count++; X sprintf("<%s-%02d%02d%02d%02d%02d%02d-%d-%05d@%s>", X $'USER, X $year,$mon+1,$mday,$hour,$min,$sec, X $msgid_count, X $$, X $'HOSTNAME); X } X X1; # for require EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl exit 0 .