mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
626 lines
16 KiB
626 lines
16 KiB
/*
|
|
This file was derived from the libwww code, version 2.15, from CERN.
|
|
A number of modifications have been made by Spyglass.
|
|
|
|
[email protected]
|
|
*/
|
|
/* Parse HyperText Document Address HTParse.c
|
|
** ================================
|
|
*/
|
|
|
|
#include "all.h"
|
|
|
|
#define HEX_ESCAPE '%'
|
|
|
|
struct struct_parts
|
|
{
|
|
char *access;
|
|
char *host;
|
|
char *absolute;
|
|
char *relative;
|
|
/* char * search; no - treated as part of path */
|
|
char *anchor;
|
|
};
|
|
|
|
|
|
/* Strip white space off a string
|
|
** ------------------------------
|
|
**
|
|
** On exit,
|
|
** Return value points to first non-white character, or to 0 if none.
|
|
** All trailing white space is OVERWRITTEN with zero.
|
|
** CMF 3-29-95: strip everything after a cr or lf
|
|
** CMF 6-08-95: rather than strip everthing after a cr or lf, just delete them
|
|
** this is for compatibility w/ netscape and some URLs
|
|
*/
|
|
|
|
PUBLIC char *HTStrip(char *s)
|
|
{
|
|
#define ISSPACE(c) ((c==' ')||(c=='\t')||(c=='\n')||(c=='\r'))
|
|
char *p = s;
|
|
char *q;
|
|
|
|
if (!s)
|
|
return NULL; /* Doesn't dump core if NULL */
|
|
while (ISSPACE(*s))
|
|
s++; /* Strip leading blanks */
|
|
for (p = s, q = s; *q; q++) /* Find end of string & strip embedded CR/LF */
|
|
if (*q != CR && *q != LF)
|
|
*p++ = *q;
|
|
*p = '\0';
|
|
for (p--; p >= s; p--)
|
|
{
|
|
if (ISSPACE(*p))
|
|
*p = 0; /* Zap trailing blanks */
|
|
else
|
|
break;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
INLINE BOOL bIsAbsolute(const char *p)
|
|
{
|
|
return (*p == '/' || *p == '\\' || (*p && (p[1] == ':' || p[1] == '|')));
|
|
}
|
|
|
|
INLINE BOOL bIsUNC(const char *p)
|
|
{
|
|
return (!strncmp(p, "\\\\", 2)) || (!strncmp(p, "//", 2));
|
|
}
|
|
|
|
INLINE BOOL bIsDrive(const char *p)
|
|
{
|
|
return (p[1] == ':' || p[1] == '|');
|
|
}
|
|
|
|
/* Scan a filename for its consituents
|
|
** -----------------------------------
|
|
**
|
|
** On entry,
|
|
** name points to a document name which may be incomplete.
|
|
** On exit,
|
|
** absolute or relative may be nonzero (but not both).
|
|
** host, anchor and access may be nonzero if they were specified.
|
|
** Any which are nonzero point to zero terminated strings.
|
|
*/
|
|
PRIVATE void scan(char *name, struct struct_parts *parts)
|
|
{
|
|
char *after_access;
|
|
char *p;
|
|
int length = strlen(name);
|
|
|
|
parts->access = 0;
|
|
parts->host = 0;
|
|
parts->absolute = 0;
|
|
parts->relative = 0;
|
|
parts->anchor = 0;
|
|
|
|
after_access = name;
|
|
|
|
for (p = name; *p; p++)
|
|
{
|
|
if (*p == ':')
|
|
{
|
|
*p = 0;
|
|
GTR_MakeStringLowerCase(after_access);
|
|
if (!strcmp(after_access,"url"))
|
|
{
|
|
after_access = p + 1;
|
|
continue;
|
|
}
|
|
parts->access = after_access; /* Access name has been specified */
|
|
after_access = p + 1;
|
|
break;
|
|
}
|
|
if (*p == '/')
|
|
break;
|
|
if (*p == '#')
|
|
break;
|
|
}
|
|
|
|
for (p = name + length - 1; p >= name; p--)
|
|
{
|
|
if (*p == '#')
|
|
{
|
|
parts->anchor = p + 1;
|
|
*p = 0; /* terminate the rest */
|
|
}
|
|
}
|
|
p = after_access;
|
|
if (parts->access && !strcmp(parts->access, "file"))
|
|
{
|
|
// sometimes file: URLS look like file:///c|/blah.htm
|
|
// or file:///blah.htm
|
|
// or file:c:/blah.htm
|
|
// or file:/blah.htm
|
|
if (!strncmp(p, "///", 3))
|
|
{
|
|
p += 3;
|
|
}
|
|
if (bIsAbsolute(p))
|
|
{
|
|
parts->absolute = ((*p != '\\' && *p != '/') || (!strncmp(p, "\\\\", 2)) ||(!strncmp(p, "//", 2))) ? p : p + 1;
|
|
}
|
|
else
|
|
{
|
|
parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (*p == '/')
|
|
{
|
|
if (p[1] == '/')
|
|
{
|
|
char *phost;
|
|
|
|
parts->host = p + 2; /* host has been specified */
|
|
*p = 0; /* Terminate access */
|
|
p = strchr(parts->host, '/'); /* look for end of host name if any */
|
|
if (p)
|
|
{
|
|
*p = 0; /* Terminate host */
|
|
parts->absolute = p + 1; /* Root has been found */
|
|
}
|
|
phost = NULL;
|
|
if (parts->access && ((!strcmp(parts->access,"ftp")) || (!strcmp(parts->access,"telnet"))))
|
|
{
|
|
phost = strrchr(parts->host, '@'); /* login for telnet,ftp */
|
|
if (!phost) phost = parts->host;
|
|
}
|
|
else if (parts->access && ((!strcmp(parts->access,"gopher")) || (!strcmp(parts->access,"http")) || (!strcmp(parts->access,"wais"))))
|
|
phost = parts->host;
|
|
if (phost) GTR_MakeStringLowerCase(phost);
|
|
}
|
|
else
|
|
{
|
|
parts->absolute = p + 1; /* Root found but no host */
|
|
}
|
|
}
|
|
else if (*p == '\\')
|
|
{
|
|
parts->absolute = p[1] == '\\' ? p : p + 1;
|
|
}
|
|
else
|
|
{
|
|
parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
|
|
}
|
|
}
|
|
|
|
#ifdef OLD_CODE
|
|
/* Access specified but no host: the anchor was not really one
|
|
e.g. news:j462#[email protected] -- JFG 10/jul/92, from bug report */
|
|
/* This kludge doesn't work for example when coming across
|
|
file:/usr/local/www/fred#123
|
|
which loses its anchor. Correct approach in news is to
|
|
escape weird characters not allowed in URL. TBL 21/dec/93
|
|
*/
|
|
if (parts->access && !parts->host && parts->anchor)
|
|
{
|
|
*(parts->anchor - 1) = '#'; /* Restore the '#' in the address */
|
|
parts->anchor = 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef NOT_DEFINED /* search is just treated as part of path */
|
|
{
|
|
char *p = relative ? relative : absolute;
|
|
if (p)
|
|
{
|
|
char *q = strchr(p, '?'); /* Any search string? */
|
|
if (q)
|
|
{
|
|
*q = 0; /* If so, chop that off. */
|
|
parts->search = q + 1;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
} /*scan */
|
|
|
|
/* Parse a Name relative to another name
|
|
** -------------------------------------
|
|
**
|
|
** This returns those parts of a name which are given (and requested)
|
|
** substituting bits from the related name where necessary.
|
|
**
|
|
** On entry,
|
|
** aName A filename given
|
|
** relatedName A name relative to which aName is to be parsed
|
|
** wanted A mask for the bits which are wanted.
|
|
**
|
|
** On exit,
|
|
** returns A pointer to a malloc'd string which MUST BE FREED
|
|
*/
|
|
char *HTParse(const char *aName, const char *relatedName, int wanted)
|
|
{
|
|
char *return_value = 0;
|
|
char *p;
|
|
char *access;
|
|
struct struct_parts given, related;
|
|
char name[MAX_URL_STRING+1];
|
|
char rel[MAX_URL_STRING+1];
|
|
char result[2*MAX_URL_STRING+1]; /* Make this longer to avoid overflow */
|
|
BOOL bIsFile;
|
|
|
|
/* Make working copies of input strings to cut up:
|
|
*/
|
|
GTR_strncpy(name, aName, MAX_URL_STRING);
|
|
GTR_strncpy(rel, relatedName, MAX_URL_STRING);
|
|
|
|
scan(name, &given);
|
|
scan(rel, &related);
|
|
result[0] = 0; /* Clear string */
|
|
access = given.access ? given.access : related.access;
|
|
if (wanted & PARSE_ACCESS)
|
|
if (access)
|
|
{
|
|
strcat(result, access);
|
|
if (wanted & PARSE_PUNCTUATION)
|
|
strcat(result, ":");
|
|
}
|
|
|
|
if (given.access && related.access) /* If different, inherit nothing. */
|
|
if (strcmp(given.access, related.access) != 0)
|
|
{
|
|
related.host = 0;
|
|
related.absolute = 0;
|
|
related.relative = 0;
|
|
related.anchor = 0;
|
|
}
|
|
bIsFile = (access && (!_stricmp(access, "file")));
|
|
|
|
if (wanted & PARSE_HOST)
|
|
if (given.host || related.host)
|
|
{
|
|
char *tail = result + strlen(result);
|
|
if (wanted & PARSE_PUNCTUATION)
|
|
strcat(result, "//");
|
|
if (given.host)
|
|
{
|
|
strcat(result, given.host);
|
|
}
|
|
else
|
|
{
|
|
strcat(result, related.host);
|
|
}
|
|
|
|
/* Ignore default port numbers, and trailing dots on FQDNs
|
|
which will only cause identical adreesses to look different */
|
|
{
|
|
char *p;
|
|
p = strchr(tail, ':');
|
|
if (p && access)
|
|
{ /* Port specified */
|
|
if ( ( strcmp(access, "http") == 0
|
|
&& strcmp(p, ":80") == 0)
|
|
|| ( strcmp(access, "gopher") == 0
|
|
&& strcmp(p, ":70") == 0)
|
|
#ifdef HTTPS_ACCESS_TYPE
|
|
|| ( strcmp(access, "https") == 0
|
|
&& strcmp(p, ":443") == 0)
|
|
#endif
|
|
#ifdef SHTTP_ACCESS_TYPE
|
|
|| ( strcmp(access, "shttp") == 0
|
|
&& strcmp(p, ":80") == 0)
|
|
#endif
|
|
)
|
|
*p = (char) 0; /* It is the default: ignore it */
|
|
}
|
|
if (!p)
|
|
p = tail + strlen(tail); /* After hostname */
|
|
if (strlen (p)) /* -dpg */
|
|
{
|
|
p--; /* End of hostname */
|
|
if (*p == '.')
|
|
*p = (char) 0; /* chop final . */
|
|
}
|
|
}
|
|
}
|
|
|
|
if (given.host && related.host) /* If different hosts, inherit no path. */
|
|
if (strcmp(given.host, related.host) != 0)
|
|
{
|
|
related.absolute = 0;
|
|
related.relative = 0;
|
|
related.anchor = 0;
|
|
}
|
|
|
|
if (wanted & PARSE_PATH)
|
|
{
|
|
BOOL bOKToInheritPath = (!given.host);
|
|
BOOL bRIsUNC = bIsFile && related.absolute && bIsUNC(related.absolute);
|
|
BOOL bRIsDrive = bIsFile && related.absolute && bIsDrive(related.absolute);
|
|
|
|
if (given.absolute)
|
|
{
|
|
BOOL bBack = FALSE;
|
|
|
|
// All is given, except for perhaps the drive or unc root
|
|
if (bIsFile)
|
|
{
|
|
BOOL bGIsUNC = given.absolute && bIsUNC(given.absolute);
|
|
BOOL bGIsDrive = given.absolute && bIsDrive(given.absolute);
|
|
|
|
bBack = bRIsUNC || bRIsDrive || bGIsUNC || bGIsDrive || strchr(given.absolute, '\\');
|
|
if ((!(bGIsUNC || bGIsDrive)) && (bRIsUNC || bRIsDrive))
|
|
{
|
|
const char *e = NULL;
|
|
|
|
if (bRIsDrive)
|
|
{
|
|
e = related.absolute + 2;
|
|
}
|
|
else
|
|
{
|
|
if (e = strchr(related.absolute + 2, '\\'))
|
|
e = strchr(e + 1, '\\');
|
|
}
|
|
if (e)
|
|
GTR_strncat(result, related.absolute, e - related.absolute);
|
|
}
|
|
}
|
|
if (((!bIsFile) || !bIsAbsolute(given.absolute)) && (wanted & PARSE_PUNCTUATION))
|
|
strcat(result, bBack ? "\\" : "/");
|
|
strcat(result, given.absolute);
|
|
}
|
|
else if (related.absolute && bOKToInheritPath)
|
|
{ /* Adopt path not name */
|
|
char slash1 = '/';
|
|
char slash2 = slash1;
|
|
BOOL bBack = FALSE;
|
|
|
|
if (bIsFile)
|
|
{
|
|
slash2 = '\\';
|
|
bBack = bRIsUNC || bRIsDrive || strchr(related.absolute, '\\');
|
|
}
|
|
if ((!bIsFile) || !bIsAbsolute(related.absolute))
|
|
strcat(result, bBack ? "\\" : "/");
|
|
strcat(result, related.absolute);
|
|
|
|
if (given.relative)
|
|
{
|
|
p = strchr(result, '?'); /* Search part? */
|
|
if (!p)
|
|
p = result + strlen(result) - 1;
|
|
for (; *p != slash1 && *p != slash2; p--) ; /* last / */
|
|
p[1] = 0; /* Remove filename */
|
|
strcat(result, given.relative); /* Add given one */
|
|
HTSimplify(result);
|
|
}
|
|
}
|
|
else if (given.relative)
|
|
{
|
|
/* The following 3 lines were copied from NCSA Mosaic for Windows */
|
|
if ((wanted & PARSE_HOST) && (given.host || related.host) && (wanted & PARSE_PUNCTUATION))
|
|
if (result[strlen(result) - 1] != '/')
|
|
strcat(result, "/");
|
|
strcat(result, given.relative); /* what we've got */
|
|
}
|
|
else if (related.relative && bOKToInheritPath)
|
|
{
|
|
strcat(result, related.relative);
|
|
}
|
|
else
|
|
{ /* No inheritance */
|
|
strcat(result, "/");
|
|
}
|
|
}
|
|
|
|
if (wanted & PARSE_ANCHOR)
|
|
if (given.anchor || related.anchor)
|
|
{
|
|
if (wanted & PARSE_PUNCTUATION)
|
|
strcat(result, "#");
|
|
strcat(result, given.anchor ? given.anchor : related.anchor);
|
|
}
|
|
|
|
/* We truncate URLs to 1024 bytes if they're too long. */
|
|
result[MAX_URL_STRING] = '\0';
|
|
return_value = GTR_strdup(result);
|
|
|
|
return return_value; /* exactly the right length */
|
|
}
|
|
|
|
|
|
/*
|
|
** As strcpy() but guaranteed to work correctly
|
|
** with overlapping parameters. AL 7 Feb 1994
|
|
*/
|
|
PRIVATE void ari_strcpy(char *to, char *from)
|
|
{
|
|
char *tmp;
|
|
|
|
if (!to || !from)
|
|
return;
|
|
|
|
tmp = (char *) GTR_MALLOC(strlen(from) + 1);
|
|
if (tmp)
|
|
{
|
|
strcpy(tmp, from);
|
|
strcpy(to, tmp);
|
|
GTR_FREE(tmp);
|
|
}
|
|
else
|
|
{
|
|
/* TODO */
|
|
}
|
|
}
|
|
/* Simplify a filename
|
|
// -------------------
|
|
//
|
|
// A unix-style file is allowed to contain the seqeunce xxx/../ which may be
|
|
// replaced by "" , and the seqeunce "/./" which may be replaced by "/".
|
|
// Simplification helps us recognize duplicate filenames.
|
|
//
|
|
// Thus, /etc/junk/../fred becomes /etc/fred
|
|
// /etc/junk/./fred becomes /etc/junk/fred
|
|
//
|
|
// but we should NOT change
|
|
// http://fred.xxx.edu/../..
|
|
//
|
|
// or ../../albert.html
|
|
//
|
|
// CMF 5/26/95. Note that many servers now bounce requests like
|
|
// http://fred.xxx.edu/../fred.gif with a 403 for security.
|
|
*/
|
|
PUBLIC void HTSimplify(char *filename)
|
|
{
|
|
char *p = filename;
|
|
char *q;
|
|
|
|
if (p)
|
|
{
|
|
while (*p && (*p == '/' || *p == '.')) /* Pass starting / or .'s */
|
|
p++;
|
|
while (*p)
|
|
{
|
|
if (*p == '/')
|
|
{
|
|
if ((p[1] == '.') && (p[2] == '.') && (p[3] == '/' || !p[3]))
|
|
{
|
|
for (q = p - 1; (q >= filename) && (*q != '/'); q--) ; /* prev slash */
|
|
if (q[0] == '/' && 0 != strncmp(q, "/../", 4))
|
|
{
|
|
if (!(q - 1 > filename && q[-1] == '/'))
|
|
{
|
|
ari_strcpy(q, p + 3); /* Remove /xxx/.. */
|
|
if (!*filename)
|
|
strcpy(filename, "/");
|
|
p = q - 1; /* Start again with prev slash */
|
|
}
|
|
else
|
|
{
|
|
ari_strcpy(p, p + 3); /* Remove starting ../ */
|
|
p = p - 1; /* Start over with rest of path */
|
|
}
|
|
}
|
|
else
|
|
{ /* xxx/.. leave it! */
|
|
#ifdef BUG_CODE
|
|
ari_strcpy(filename, p[3] ? p + 4 : p + 3); /* rm xxx/../ */
|
|
p = filename; /* Start again */
|
|
#endif
|
|
}
|
|
}
|
|
else if ((p[1] == '.') && (p[2] == '/' || !p[2]))
|
|
{
|
|
ari_strcpy(p, p + 2); /* Remove a slash and a dot */
|
|
}
|
|
#if 0
|
|
else if (p[-1] != ':')
|
|
{
|
|
while (p[1] == '/')
|
|
{
|
|
ari_strcpy(p, p + 1); /* Remove multiple slashes */
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
p++;
|
|
} /* end while (*p) */
|
|
} /* end if (p) */
|
|
}
|
|
|
|
|
|
/* Escape undesirable characters using % HTEscape()
|
|
** -------------------------------------
|
|
**
|
|
** This function takes a pointer to a string in which
|
|
** some characters may be unacceptable unescaped.
|
|
** It returns a string which has these characters
|
|
** represented by a '%' character followed by two hex digits.
|
|
**
|
|
** Unlike HTUnEscape(), this routine returns a malloced string.
|
|
*/
|
|
|
|
PRIVATE CONST unsigned char isAcceptable[96] =
|
|
|
|
/* Bit 0 xalpha -- see HTFile.h
|
|
** Bit 1 xpalpha -- as xalpha but with plus.
|
|
** Bit 2 ... path -- as xpalphas but with /
|
|
*/
|
|
/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
|
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 7, 7, 4, /* 2x !"#$%&'()*+,-./ */
|
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, /* 3x 0123456789:;<=>? */
|
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 4x @ABCDEFGHIJKLMNO */
|
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, /* 5X PQRSTUVWXYZ[\]^_ */
|
|
0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 6x `abcdefghijklmno */
|
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0}; /* 7X pqrstuvwxyz{\}~ DEL */
|
|
|
|
PRIVATE char *hex = "0123456789ABCDEF";
|
|
|
|
PUBLIC char *HTEscape(CONST char *str, unsigned char mask, char protect)
|
|
{
|
|
#define ACCEPTABLE(a) ((a>=32 && a<128 && ((isAcceptable[a-32]) & mask)) || a == protect)
|
|
CONST char *p;
|
|
char *q;
|
|
char *result;
|
|
int unacceptable = 0;
|
|
|
|
for (p = str; *p; p++)
|
|
if (!ACCEPTABLE((unsigned char) (*p)))
|
|
unacceptable++;
|
|
result = (char *) GTR_MALLOC(p - str + unacceptable + unacceptable + 1);
|
|
if (result)
|
|
{
|
|
for (q = result, p = str; *p; p++)
|
|
{
|
|
unsigned char a = *p;
|
|
if (!ACCEPTABLE(a))
|
|
{
|
|
*q++ = HEX_ESCAPE; /* Means hex commming */
|
|
*q++ = hex[a >> 4];
|
|
*q++ = hex[a & 15];
|
|
}
|
|
else
|
|
*q++ = *p;
|
|
}
|
|
*q++ = 0; /* Terminate */
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/* Decode %xx escaped characters HTUnEscape()
|
|
** -----------------------------
|
|
**
|
|
** This function takes a pointer to a string in which some
|
|
** characters may have been encoded in %xy form, where xy is
|
|
** the acsii hex code for character 16x+y.
|
|
** The string is converted in place, as it will never grow.
|
|
*/
|
|
|
|
PRIVATE char from_hex(char c)
|
|
{
|
|
return c >= '0' && c <= '9' ? c - '0'
|
|
: c >= 'A' && c <= 'F' ? c - 'A' + 10
|
|
: c - 'a' + 10; /* accept small letters just in case */
|
|
}
|
|
|
|
PUBLIC char *HTUnEscape(char *str)
|
|
{
|
|
char *p = str;
|
|
char *q = str;
|
|
while (*p)
|
|
{
|
|
if (*p == HEX_ESCAPE)
|
|
{
|
|
p++;
|
|
if (*p)
|
|
*q = from_hex(*p++) * 16;
|
|
if (*p)
|
|
*q = *q + from_hex(*p++);
|
|
q++;
|
|
}
|
|
else
|
|
{
|
|
*q++ = *p++;
|
|
}
|
|
}
|
|
|
|
*q++ = 0;
|
|
return str;
|
|
|
|
} /* HTUnEscape */
|