# RFC2396.pm - Syntax for URI (RFC2396/RFC2368/RFC1738/RFC822)
#
#    0.0  1998/10/03
#    0.1  1999/01/19
#    0.2  1999/03/04 RFC2368
#    0.3  1999/07/24 fixed(?) bug in mailto:
#    0.40 1999/08/28 tune up regex / use Exporter / expire parse_URI
#    0.41 1999/12/05 fixed(?) bug in URL_mailto
#    0.42 1999/12/12 use Email:Valid
#    0.43 2000/04/22 fixed bug in news:
#
# by ISHINO Keiichiro <k16@chiba.email.ne.jp>

package RFC2396;
require 5.002;

BEGIN {
  use Exporter;
  @ISA    = qw(Exporter);
  @EXPORT = qw($URI_reference &URI_reference
               $URI_parsing
               $URL_ftp       &URL_ftp
               $URL_file      &URL_file
               $URL_http      &URL_http
               $URL_gopher    &URL_gopher
               $URL_mailto    &URL_mailto
               $URL_news      &URL_news
               $URL_nntp      &URL_nntp
               $URL_telnet    &URL_telnet
               $URL_wais      &URL_wais
               $URL_prospero  &URL_prospero
               );
}

$VERSION = '0.43';

#$lowalpha     = '[a-z]';
#$upalpha      = '[A-Z]';
$_digit        = '0-9';             # char class
$_alpha        = 'A-Za-z';          # char class
$_alphanum     = $_digit.$_alpha;   # char class
$digit         = '['.$_digit.']';
$alpha         = '['.$_alpha.']';
$alphanum      = '['.$_alphanum.']';

$hex           = '[0-9A-Fa-f]';
$hex2          = $hex.$hex;
$escaped       = '\%'.$hex2;

$control       = '[^\x20-\x7E]';
$_space        = '\x20';            # char class
$_delim        = '<>#%"';           # char class
$_unwise       = '{}|\x5C^\[\]`';   # char class
$spdelimunwise = '['.$_space.$_delim.$_unwise.']';
$delimunwise   = '['.$_delim.$_unwise.']';

$_mark         = '\-_.!~*\'()';     # char class
$_unreserved   = $_alphanum.$_mark; # char class
$_reserved     = ';/?:@&=+$,';      # char class
$uric          = '(?:['.$_unreserved.$_reserved.']|'.$escaped.')';
$uric_no_slash = '(?:['.$_unreserved.';?:@&=+$,]|'.$escaped.')';

$fragment      = $uric.'*';
$query         = $uric.'*';

$pchar         = '(?:['.$_unreserved.':@&=+$,]|'.$escaped.')';
$param         = $pchar.'+';
$segment       = '(?:'.$pchar.'+(?:;'.$param.')*|(?:;'.$param.')+)';
$path_segments = $segment.'(?:/(?:'.$segment.')?)*';
$rel_segment   = '(?:['.$_unreserved.';@&=+$,]|'.$escaped.')+';

$IPv4part      = '(?:[01]?\d\d?|2[0-4]\d|25[0-5])';
$IPv4address   = $IPv4part.'\.'.$IPv4part.'\.'.$IPv4part.'\.'.$IPv4part;
$toplabel      = $alpha.'(?:['.$_alphanum.'\-]*'.$alphanum.')?';
$domainlabel   = $alphanum.'(?:['.$_alphanum.'\-]*'.$alphanum.')?';
$hostname      = '(?:'.$domainlabel.'\.)*(?:'.$toplabel.')\.?';
$host          = $hostname.'|'.$IPv4address;
$port          = $digit.'*';
$hostport      = '(?:'.$host.')(?::'.$port.')?';

$reg_name      = '(?:['.$_unreserved.';:@&=+$,]|'.$escaped.')+';
$userinfo      = '(?:['.$_unreserved.';:&=+$,]|'.$escaped.')+';
$server        = '(?:'.$userinfo.'\@)?'.$hostport;
$authority     = '(?:'.$server.'|'.$reg_name.')';

$scheme        = $alpha.'['.$_alphanum.'+\-.]*';

$abs_path      = '/(?:'.$path_segments.')?';
$net_path      = '//(?:'.$authority.'(?:'.$abs_path.')?|'.$abs_path.')';
$rel_path      = $rel_segment.'(?:'.$abs_path.')?';

$hier_part     = '(?:'.$net_path.'|'.$abs_path.')(?:\?'.$query.')?';
$opaque_part   = $uric_no_slash.$uric.'*';

$absoluteURI   = '(?:'.$scheme.':(?:'.$hier_part.'|'.$opaque_part.'))';
$relativeURI   = '(?:'.$net_path.'|'.$abs_path.'|'.$rel_path.')(?:\?'.$query.')?';

$URI_reference = '(?:(?:'.$absoluteURI.'|'.$relativeURI.')(?:\#'.$fragment.')?|'.
                                                            '\#'.$fragment.')';
sub URI_reference { $_[0] =~ /^$URI_reference$/o; }

# RFC2396 Appendix B
$URI_parsing = '(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?';


##############

# FTP (see also RFC959)
$ftptype       = '[AIDaid]';
$fsegment      = '(?:['.$_unreserved.'?:@&=+$,]|'.$escaped.')*';
$fpath         = $fsegment.'(?:/'.$fsegment.')*';
$password      = $userinfo;
$login         = '(?:'.$userinfo.'(?::'.$password.')?\@)?'.$hostport;
$URL_ftp       = 'ftp://'.$login.'(?:/'.$fpath.'(?:;type='.$ftptype.')?)?';
sub URL_ftp { $_[0] =~ /^$URL_ftp$/o; }

# FILE (see also RFC1738)
$URL_file      = 'file://(?:'. $host.'|localhost)?/'.$fpath;
sub URL_file { $_[0] =~ /^$URL_file$/o; }

# HTTP (see also RFC1738)
$hsegment      = '(?:['.$_unreserved.';:@&=+$,]|'.$escaped.')*';
$hpath         = $hsegment.'(?:/'.$hsegment.')*';
$URL_http      = 'https?://'.$hostport.'(?:/'.$hpath.'(?:\?'.$query.')?)?';
sub URL_http { $_[0] =~ /^$URL_http$/o; }

# GOPHER (see also RFC1436)
$gopher_string = $uric.'*';
$selector      = $uric.'*';
$gtype         = $uric;
$URL_gopher    = 'gopher://'.$hostport.'(?:/(?:'.$gtype.'(?:'.$selector.
                 '(?:\t'.$query.'(?:\t'.$gopher_string.')?)?)?)?)?';
sub URL_gopher { $_[0] =~ /^$URL_gopher$/o; }

# MAILTO (see also RFC822/RFC2368)
$_urlc         = $_unreserved.';/:@+$,'; # char class
$urlc          = '(?:['.$_urlc.']|'.$escaped.')';
$hname         = $urlc.'*';
$hvalue        = $urlc.'*';
$header        = $hname.'='.$hvalue;
$headers       = '(\?'.$header.'(?:&'.$header.')*)?';
$to            = '('.$urlc.'*)';
$URL_mailto    = 'mailto:'.$to.$headers;

# Regular expression built using Jeffrey Friedl's example in
# _Mastering Regular Expressions_ (http://www.ora.com/catalog/regexp/).
$RFC822PAT = <<'EOF';
[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\
xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xf
f\n\015()]*)*\)[\040\t]*)*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\x
ff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015
"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\
xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80
-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*
)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\
\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\
x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x8
0-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n
\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040
\t]*)*)*@[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([
^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\
\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\
x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()
]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\
x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\04
0\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\
n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\
015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?!
[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\
]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\
x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\01
5()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*|(?:[^(\040)<>@,;:".
\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]
)|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^
()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037]*(?:(?:\([^\\\x80-\xff\n\0
15()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][
^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)|"[^\\\x80-\xff\
n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^()<>@,;:".\\\[\]\
x80-\xff\000-\010\012-\037]*)*<[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?
:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-
\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:@[\040\t]*
(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015
()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()
]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\0
40)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\
[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\
xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*
)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80
-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x
80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t
]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\
\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])
*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x
80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80
-\xff\n\015()]*)*\)[\040\t]*)*)*(?:,[\040\t]*(?:\([^\\\x80-\xff\n\015(
)]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\
\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*@[\040\t
]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\0
15()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015
()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(
\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|
\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80
-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()
]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040
\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".
\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff
])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\
\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x
80-\xff\n\015()]*)*\)[\040\t]*)*)*)*:[\040\t]*(?:\([^\\\x80-\xff\n\015
()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\
\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)?(?:[^
(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-
\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\
n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|
\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))
[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff
\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\x
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(
?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\
000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\
xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\x
ff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)
*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*@[\040\t]*(?:\([^\\\x80-\x
ff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-
\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)
*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\
]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
)[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-
\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\x
ff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(
?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80
-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8
0-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?:
\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]
*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)
*\)[\040\t]*)*)*>)
EOF
$RFC822PAT =~ s/\n//g;

sub URL_mailto {
  unless ($_[0] =~ /^$URL_mailto$/o) { return 0; }
  my ($to, $headers) = ($1, $2);
  if ($to eq '') { return $headers ne ''; }
  $to =~ s/\%($hex2)/chr(hex($1))/oge;
  $to =~ /^$RFC822PAT$/o;
}

# NEWS (see also RFC1036)
$article       = '(?:['.$_unreserved.';/?:&=+$,]|'.$escaped.')+\@'.$host;
$newsgroup     = '(?:'.$alpha.'['.$_alphanum.'\-.+_]*)';
$grouppart     = '(?:\*|'.$newsgroup.'|'.$article.')';
$URL_news      = 'news:'.$grouppart;
sub URL_news { $_[0] =~ /^$URL_news$/o; }

# NNTP (see also RFC977)
$URL_nntp      = 'nntp://'.$hostport.'/'.$newsgroup.'(?:/'.$digit.'+)?';
sub URL_nntp { $_[0] =~ /^$URL_nntp$/o; }

# TELNET (see also RFC1738)
$URL_telnet    = 'telnet://'.$login.'/?';
sub URL_telnet { $_[0] =~ /^$URL_telnet$/o; }

# WAIS (see also RFC1625)
$wpath         = '(?:['.$_unreserved.'+$,]|'.$escaped.')*';
$wtype         = '(?:['.$_unreserved.'+$,]|'.$escaped.')*';
$database      = '(?:['.$_unreserved.'+$,]|'.$escaped.')*';
$waisdoc       = $database.'/'.$wtype.'/'.$wpath;
$waisindex     = $database.'\?'.$query;
$waisdatabase  = $database;
$URL_wais      = 'wais://'.$hostport.'/(?:'.$waisdatabase.'|'.$waisindex.'|'.$waisdoc.')';
sub URL_wais { $_[0] =~ /^$URL_wais$/o; }

# PROSPERO (see also RFC1738)
$fieldvalue    = '(?:['.$_unreserved.'?:@&+$,]|'.$escaped.')*';
$fieldname     = '(?:['.$_unreserved.'?:@&+$,]|'.$escaped.')*';
$fieldspec     = ';'.$fieldname.'='.$fieldvalue;
$psegment      = '(?:['.$_unreserved.'?:@&=+$,]|'.$escaped.')*';
$ppath         = $psegment.'(?:/'.$psegment.')*';
$URL_prospero  = 'prospero://'.$hostport.'/'.$ppath.'(?:'.$fieldspec.')*';
sub URL_prospero { $_[0] =~ /^$URL_prospero$/o; }

1;
