Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

626 lines
16 KiB

/*
This file was derived from the libwww code, version 2.15, from CERN.
A number of modifications have been made by Spyglass.
[email protected]
*/
/* Parse HyperText Document Address HTParse.c
** ================================
*/
#include "all.h"
#define HEX_ESCAPE '%'
struct struct_parts
{
char *access;
char *host;
char *absolute;
char *relative;
/* char * search; no - treated as part of path */
char *anchor;
};
/* Strip white space off a string
** ------------------------------
**
** On entry,
**	s	points to a writable zero terminated string, or is NULL.
**
** On exit,
**	Return value points to first non-white character, or to the
**	terminating zero if there is none.  NULL is returned for NULL.
**	Every CR and LF after the leading white space is deleted and the
**	characters behind it are moved down to close the gap.
**	All trailing white space is OVERWRITTEN with zero.
**
** CMF 3-29-95: strip everything after a cr or lf
** CMF 6-08-95: rather than strip everything after a cr or lf, just delete
**	them.  This is for compatibility w/ netscape and some URLs.
*/
PUBLIC char *HTStrip(char *s)
{
#define ISSPACE(c) (((c)==' ')||((c)=='\t')||((c)=='\n')||((c)=='\r'))
    char *p;
    char *q;

    if (!s)
        return NULL;            /* Doesn't dump core if NULL */

    while (ISSPACE(*s))
        s++;                    /* Strip leading blanks */

    /* Compact in place: q reads every character, p writes the ones kept */
    for (p = s, q = s; *q; q++)
        if (*q != CR && *q != LF)
            *p++ = *q;
    *p = '\0';

    /* Zap trailing blanks.  p is only moved back while it is still above
       s, so no pointer below the start of the string is ever formed
       (the old "for (p--; p >= s; p--)" did that for an empty string). */
    while (p > s && ISSPACE(p[-1]))
        *--p = '\0';

    return s;
}
/* A path counts as absolute when it starts with a slash of either kind,
** or when its second character is ':' or '|' (a drive specifier).
** The second character is only examined for a non-empty string. */
INLINE BOOL bIsAbsolute(const char *p)
{
    const char lead = p[0];
    BOOL bRooted = (lead == '/' || lead == '\\');
    BOOL bDrive = (lead != '\0' && (p[1] == ':' || p[1] == '|'));

    return (bRooted || bDrive);
}
/* A UNC path starts with two backslashes or with two forward slashes. */
INLINE BOOL bIsUNC(const char *p)
{
    if (p[0] == '\\' && p[1] == '\\')
        return (p[0] == '\\');          /* "\\\\..." */

    return (p[0] == '/' && p[1] == '/'); /* "//..." or not UNC at all */
}
/* Does the path begin with a drive specifier, i.e. is its second
** character ':' or '|' (as in "c:/x" or "c|/x")?
**
** An empty string has no second character, so it is never a drive.
** The first-character test keeps p[1] from being read past the
** terminator, and matches the guard bIsAbsolute() already applies. */
INLINE BOOL bIsDrive(const char *p)
{
    return (*p && (p[1] == ':' || p[1] == '|'));
}
/* Scan a filename for its constituents
** ------------------------------------
**
** On entry,
** name points to a document name which may be incomplete.
** The buffer is cut up in place: separator characters are
** overwritten with zero and the access name (and, for some
** access types, the host) is handed to GTR_MakeStringLowerCase.
** On exit,
** absolute or relative may be nonzero (but not both).
** host, anchor and access may be nonzero if they were specified.
** Any which are nonzero point to zero terminated strings.
*/
PRIVATE void scan(char *name, struct struct_parts *parts)
{
char *after_access;
char *p;
/* Taken before anything is cut: the first loop may write a zero into
   name, and the anchor loop below still has to cover the whole buffer. */
int length = strlen(name);
parts->access = 0;
parts->host = 0;
parts->absolute = 0;
parts->relative = 0;
parts->anchor = 0;
after_access = name;
/* Look for "access:" at the front.  The search gives up at the first
   '/' or '#', so a ':' further along is not taken as an access name. */
for (p = name; *p; p++)
{
if (*p == ':')
{
*p = 0;
GTR_MakeStringLowerCase(after_access);
/* A leading "url:" is skipped and the search goes on after it */
if (!strcmp(after_access,"url"))
{
after_access = p + 1;
continue;
}
parts->access = after_access; /* Access name has been specified */
after_access = p + 1;
break;
}
if (*p == '/')
break;
if (*p == '#')
break;
}
/* Walk backwards over the whole buffer.  Every '#' is replaced by zero
   and anchor is updated each time, so it ends up just past the FIRST
   '#' in the name. */
for (p = name + length - 1; p >= name; p--)
{
if (*p == '#')
{
parts->anchor = p + 1;
*p = 0; /* terminate the rest */
}
}
p = after_access;
if (parts->access && !strcmp(parts->access, "file"))
{
// sometimes file: URLS look like file:///c|/blah.htm
// or file:///blah.htm
// or file:c:/blah.htm
// or file:/blah.htm
if (!strncmp(p, "///", 3))
{
p += 3;
}
if (bIsAbsolute(p))
{
/* Keep p as it is for a drive-style path (no leading slash) and for a
   UNC path (two leading slashes of either kind); otherwise step over
   the single leading slash. */
parts->absolute = ((*p != '\\' && *p != '/') || (!strncmp(p, "\\\\", 2)) ||(!strncmp(p, "//", 2))) ? p : p + 1;
}
else
{
/* Note this uses after_access rather than p, so a "///" skipped
   above is still part of the relative name. */
parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
}
}
else
{
if (*p == '/')
{
if (p[1] == '/')
{
char *phost;
parts->host = p + 2; /* host has been specified */
*p = 0; /* Terminate access */
p = strchr(parts->host, '/'); /* look for end of host name if any */
if (p)
{
*p = 0; /* Terminate host */
parts->absolute = p + 1; /* Root has been found */
}
/* Pick the part of the host to hand to GTR_MakeStringLowerCase:
   for ftp/telnet everything from the last '@' on (or the whole host
   when there is no '@'), for gopher/http/wais the whole host, and
   nothing for any other access name or when none was given. */
phost = NULL;
if (parts->access && ((!strcmp(parts->access,"ftp")) || (!strcmp(parts->access,"telnet"))))
{
phost = strrchr(parts->host, '@'); /* login for telnet,ftp */
if (!phost) phost = parts->host;
}
else if (parts->access && ((!strcmp(parts->access,"gopher")) || (!strcmp(parts->access,"http")) || (!strcmp(parts->access,"wais"))))
phost = parts->host;
if (phost) GTR_MakeStringLowerCase(phost);
}
else
{
parts->absolute = p + 1; /* Root found but no host */
}
}
else if (*p == '\\')
{
/* A doubled backslash is kept whole; a single one is stepped over */
parts->absolute = p[1] == '\\' ? p : p + 1;
}
else
{
parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
}
}
/* The two sections below are compiled only when OLD_CODE / NOT_DEFINED
   are defined.  NOTE(review): the NOT_DEFINED section refers to
   `relative`, `absolute` and `parts->search`, none of which is declared
   in the code visible here, so it would presumably not build as is. */
#ifdef OLD_CODE
/* Access specified but no host: the anchor was not really one
e.g. news:j462#[email protected] -- JFG 10/jul/92, from bug report */
/* This kludge doesn't work for example when coming across
file:/usr/local/www/fred#123
which loses its anchor. Correct approach in news is to
escape weird characters not allowed in URL. TBL 21/dec/93
*/
if (parts->access && !parts->host && parts->anchor)
{
*(parts->anchor - 1) = '#'; /* Restore the '#' in the address */
parts->anchor = 0;
}
#endif
#ifdef NOT_DEFINED /* search is just treated as part of path */
{
char *p = relative ? relative : absolute;
if (p)
{
char *q = strchr(p, '?'); /* Any search string? */
if (q)
{
*q = 0; /* If so, chop that off. */
parts->search = q + 1;
}
}
}
#endif
} /*scan */
/* Parse a Name relative to another name
** -------------------------------------
**
** This returns those parts of a name which are given (and requested)
** substituting bits from the related name where necessary.
**
** On entry,
**	aName		A filename given
**	relatedName	A name relative to which aName is to be parsed
**	wanted		A mask for the bits which are wanted: any of
**			PARSE_ACCESS, PARSE_HOST, PARSE_PATH and
**			PARSE_ANCHOR, plus PARSE_PUNCTUATION to have the
**			":", "//", "/" and "#" separators put back as well.
**
** On exit,
**	returns		The GTR_strdup() copy of the assembled name, cut
**			to at most MAX_URL_STRING characters.  It is a
**			malloc'd string which MUST BE FREED.
**
** Both names are copied with GTR_strncpy() into local buffers before
** being cut up; aName and relatedName themselves are not written to.
*/
char *HTParse(const char *aName, const char *relatedName, int wanted)
{
    char *return_value = 0;
    char *p;
    char *access;
    struct struct_parts given, related;
    char name[MAX_URL_STRING+1];
    char rel[MAX_URL_STRING+1];
    char result[2*MAX_URL_STRING+1];    /* Make this longer to avoid overflow */
    BOOL bIsFile;

    /* Make working copies of input strings to cut up: */
    GTR_strncpy(name, aName, MAX_URL_STRING);
    GTR_strncpy(rel, relatedName, MAX_URL_STRING);

    scan(name, &given);
    scan(rel, &related);
    result[0] = 0;              /* Clear string */

    access = given.access ? given.access : related.access;
    if ((wanted & PARSE_ACCESS) && access)
    {
        strcat(result, access);
        if (wanted & PARSE_PUNCTUATION)
            strcat(result, ":");
    }

    /* If different, inherit nothing. */
    if (given.access && related.access
        && strcmp(given.access, related.access) != 0)
    {
        related.host = 0;
        related.absolute = 0;
        related.relative = 0;
        related.anchor = 0;
    }

    bIsFile = (access && (!_stricmp(access, "file")));

    if ((wanted & PARSE_HOST) && (given.host || related.host))
    {
        char *tail = result + strlen(result);
        char *port;
        char *host_end;

        if (wanted & PARSE_PUNCTUATION)
            strcat(result, "//");
        strcat(result, given.host ? given.host : related.host);

        /* Ignore default port numbers, and trailing dots on FQDNs
           which will only cause identical addresses to look different */
        port = strchr(tail, ':');
        if (port && access)
        {                       /* Port specified */
            if (   (strcmp(access, "http") == 0
                    && strcmp(port, ":80") == 0)
                || (strcmp(access, "gopher") == 0
                    && strcmp(port, ":70") == 0)
#ifdef HTTPS_ACCESS_TYPE
                || (strcmp(access, "https") == 0
                    && strcmp(port, ":443") == 0)
#endif
#ifdef SHTTP_ACCESS_TYPE
                || (strcmp(access, "shttp") == 0
                    && strcmp(port, ":80") == 0)
#endif
                )
                *port = (char) 0;       /* It is the default: ignore it */
        }

        /* The host name ends at the ':' if there is one (even one that was
           just zeroed), otherwise at the end of the string.  A final '.'
           is squeezed out by moving whatever follows it -- a port that was
           kept, or only the terminator -- one place down.  The previous
           code tested strlen() at that position, which is always zero
           without a port, so the dot was never chopped; and with a port
           that was kept it zeroed the dot and so cut the port off too. */
        host_end = port ? port : tail + strlen(tail);
        if (host_end > tail && host_end[-1] == '.')
            memmove(host_end - 1, host_end, strlen(host_end) + 1);
    }

    /* If different hosts, inherit no path. */
    if (given.host && related.host
        && strcmp(given.host, related.host) != 0)
    {
        related.absolute = 0;
        related.relative = 0;
        related.anchor = 0;
    }

    if (wanted & PARSE_PATH)
    {
        BOOL bOKToInheritPath = (!given.host);
        BOOL bRIsUNC = bIsFile && related.absolute && bIsUNC(related.absolute);
        BOOL bRIsDrive = bIsFile && related.absolute && bIsDrive(related.absolute);

        if (given.absolute)
        {
            BOOL bBack = FALSE;

            // All is given, except for perhaps the drive or unc root
            if (bIsFile)
            {
                BOOL bGIsUNC = given.absolute && bIsUNC(given.absolute);
                BOOL bGIsDrive = given.absolute && bIsDrive(given.absolute);

                bBack = bRIsUNC || bRIsDrive || bGIsUNC || bGIsDrive || strchr(given.absolute, '\\');
                if ((!(bGIsUNC || bGIsDrive)) && (bRIsUNC || bRIsDrive))
                {
                    /* Borrow the drive ("c:") or the UNC root from the
                       related name; e marks the end of what is borrowed. */
                    const char *e = NULL;

                    if (bRIsDrive)
                    {
                        e = related.absolute + 2;
                    }
                    else
                    {
                        /* Skip the two leading slashes, then stop at the
                           second backslash that follows.
                           NOTE(review): bIsUNC() also accepts "//", but
                           only '\\' is searched for here, so a UNC root
                           written with forward slashes is not inherited --
                           confirm whether that is intended. */
                        e = strchr(related.absolute + 2, '\\');
                        if (e)
                            e = strchr(e + 1, '\\');
                    }
                    if (e)
                        GTR_strncat(result, related.absolute, e - related.absolute);
                }
            }
            if (((!bIsFile) || !bIsAbsolute(given.absolute)) && (wanted & PARSE_PUNCTUATION))
                strcat(result, bBack ? "\\" : "/");
            strcat(result, given.absolute);
        }
        else if (related.absolute && bOKToInheritPath)
        {                       /* Adopt path not name */
            char slash1 = '/';
            char slash2 = slash1;
            BOOL bBack = FALSE;
            char *path_start;

            if (bIsFile)
            {
                slash2 = '\\';
                bBack = bRIsUNC || bRIsDrive || strchr(related.absolute, '\\');
            }
            if ((!bIsFile) || !bIsAbsolute(related.absolute))
                strcat(result, bBack ? "\\" : "/");
            path_start = result + strlen(result);       /* where the related path goes */
            strcat(result, related.absolute);
            if (given.relative)
            {
                p = strchr(result, '?');        /* Search part? */
                if (!p)
                    p = result + strlen(result) - 1;
                /* Back up to the last slash, but never past the start of
                   result: a path such as "c:foo.htm" holds no slash at
                   all and the old unbounded loop ran off the buffer. */
                while (p > result && *p != slash1 && *p != slash2)
                    p--;
                if (*p == slash1 || *p == slash2)
                    p[1] = 0;                   /* Remove filename */
                else
                    path_start[bRIsDrive ? 2 : 0] = 0; /* no slash: keep only a drive spec */
                strcat(result, given.relative); /* Add given one */
                HTSimplify(result);
            }
        }
        else if (given.relative)
        {
            /* The following 3 lines were copied from NCSA Mosaic for Windows */
            if ((wanted & PARSE_HOST) && (given.host || related.host) && (wanted & PARSE_PUNCTUATION))
                if (result[strlen(result) - 1] != '/')
                    strcat(result, "/");
            strcat(result, given.relative);     /* what we've got */
        }
        else if (related.relative && bOKToInheritPath)
        {
            strcat(result, related.relative);
        }
        else
        {                       /* No inheritance */
            strcat(result, "/");
        }
    }

    if ((wanted & PARSE_ANCHOR) && (given.anchor || related.anchor))
    {
        if (wanted & PARSE_PUNCTUATION)
            strcat(result, "#");
        strcat(result, given.anchor ? given.anchor : related.anchor);
    }

    /* We truncate URLs to MAX_URL_STRING bytes if they're too long. */
    result[MAX_URL_STRING] = '\0';
    return_value = GTR_strdup(result);
    return return_value;        /* exactly the right length */
}
/*
** As strcpy() but guaranteed to work correctly
** with overlapping parameters. AL 7 Feb 1994
**
** Copies the zero terminated string at `from`, terminator included, to
** `to`.  Nothing is done when either pointer is NULL.
**
** memmove() is specified for overlapping regions, so no temporary
** buffer is needed.  The earlier version went through a GTR_MALLOC'd
** copy and silently did nothing when that allocation failed, which left
** HTSimplify() looking at the same unshortened text over and over.
*/
PRIVATE void ari_strcpy(char *to, char *from)
{
    if (!to || !from)
        return;
    memmove(to, from, strlen(from) + 1);
}
/* Simplify a filename
// -------------------
//
// A unix-style file is allowed to contain the sequence xxx/../ which may be
// replaced by "" , and the sequence "/./" which may be replaced by "/".
// Simplification helps us recognize duplicate filenames.
//
// Thus, /etc/junk/../fred becomes /etc/fred
// /etc/junk/./fred becomes /etc/junk/fred
//
// A "/.." whose preceding segment starts right after a "//" (the host
// part of a URL) is dropped on its own, leaving the host in place.
// A "/.." with no earlier slash in the string, or following another
// "/../", is left alone, e.g.
// ../../albert.html
//
// On entry,
// filename points to a writable zero terminated string, or is NULL
// (in which case nothing is done).
// On exit,
// filename has been simplified in place; it never grows.
//
// CMF 5/26/95. Note that many servers now bounce requests like
// http://fred.xxx.edu/../fred.gif with a 403 for security.
*/
PUBLIC void HTSimplify(char *filename)
{
    char *p = filename;
    char *q;

    if (!p)
        return;

    while (*p && (*p == '/' || *p == '.'))      /* Pass starting / or .'s */
        p++;

    while (*p)
    {
        if (*p == '/')
        {
            if ((p[1] == '.') && (p[2] == '.') && (p[3] == '/' || !p[3]))
            {
                /* Find the previous slash.  q stops either at filename
                   (there is none) or just after that slash, so neither
                   filename[-1] is read nor a pointer below filename
                   formed, which the old scan did when no slash existed. */
                for (q = p; q > filename && q[-1] != '/'; q--) ;

                if (q > filename && 0 != strncmp(q - 1, "/../", 4))
                {
                    q--;                        /* q is the previous slash */
                    if (!(q > filename + 1 && q[-1] == '/'))
                    {
                        ari_strcpy(q, p + 3);   /* Remove /xxx/.. */
                        if (!*filename)
                            strcpy(filename, "/");
                        p = q;                  /* Start again with prev slash */
                    }
                    else
                    {
                        ari_strcpy(p, p + 3);   /* Remove starting ../ */
                        /* p stays put: start over with rest of path */
                    }
                    continue;
                }
                else
                {               /* xxx/.. leave it! */
#ifdef BUG_CODE
                    ari_strcpy(filename, p[3] ? p + 4 : p + 3);         /* rm xxx/../ */
                    p = filename;               /* Start again */
#endif
                }
            }
            else if ((p[1] == '.') && (p[2] == '/' || !p[2]))
            {
                ari_strcpy(p, p + 2);           /* Remove a slash and a dot */
                /* Look at this position again instead of stepping over it:
                   what moved down may start another "/./" or "/../", and
                   for a trailing "/." it is the terminator itself. */
                continue;
            }
#if 0
            else if (p[-1] != ':')
            {
                while (p[1] == '/')
                {
                    ari_strcpy(p, p + 1);       /* Remove multiple slashes */
                }
            }
#endif
        }
        p++;
    }                           /* end while (*p) */
}
/* Escape undesirable characters using % HTEscape()
** -------------------------------------
**
** This function takes a pointer to a string in which
** some characters may be unacceptable unescaped.
** It returns a string which has these characters
** represented by a '%' character followed by two hex digits.
**
** Unlike HTUnEscape(), this routine returns a malloced string.
*/
/* Per-character permission bits for the 96 characters 0x20..0x7F,
** indexed by (character - 32).  HTEscape() ANDs an entry with its mask
** argument to decide whether the character may pass unescaped.
** In the table 7 sets all three bits, 6 (only '+') sets bits 1 and 2,
** and 4 (only '/') sets bit 2 alone.
*/
PRIVATE CONST unsigned char isAcceptable[96] =
/* Bit 0 xalpha -- see HTFile.h
** Bit 1 xpalpha -- as xalpha but with plus.
** Bit 2 ... path -- as xpalphas but with /
*/
/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 7, 7, 4, /* 2x !"#$%&'()*+,-./ */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, /* 3x 0123456789:;<=>? */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 4x @ABCDEFGHIJKLMNO */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, /* 5X PQRSTUVWXYZ[\]^_ */
0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 6x `abcdefghijklmno */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0}; /* 7X pqrstuvwxyz{|}~ DEL */
/* Digits used by HTEscape() to write the two hex characters of an escape */
PRIVATE char *hex = "0123456789ABCDEF";
/* Build an escaped copy of str.
**
** On entry,
**	str	zero terminated string to escape (not modified)
**	mask	bits tested against the isAcceptable[] entry of each
**		character in the range 32..127
**	protect	one character that is always copied through unescaped
**
** On exit,
**	returns	a GTR_MALLOC'd string in which every character that is
**		neither acceptable under mask nor equal to protect has been
**		replaced by HEX_ESCAPE followed by two upper case hex digits,
**		or the failed GTR_MALLOC result if no memory was available.
*/
PUBLIC char *HTEscape(CONST char *str, unsigned char mask, char protect)
{
#define ACCEPTABLE(a) ((a>=32 && a<128 && ((isAcceptable[a-32]) & mask)) || a == protect)
    CONST char *src;
    char *dst;
    char *escaped;
    int nEscapes = 0;

    /* First pass: find the end of the string and count the escapes */
    src = str;
    while (*src)
    {
        if (!ACCEPTABLE((unsigned char) (*src)))
            nEscapes++;
        src++;
    }

    /* An escape takes three characters instead of one; +1 for the zero */
    escaped = (char *) GTR_MALLOC(src - str + nEscapes + nEscapes + 1);
    if (!escaped)
        return escaped;

    /* Second pass: copy, expanding as we go */
    dst = escaped;
    for (src = str; *src; src++)
    {
        unsigned char ch = *src;

        if (ACCEPTABLE(ch))
        {
            *dst++ = *src;
            continue;
        }
        *dst++ = HEX_ESCAPE;    /* Means hex coming */
        *dst++ = hex[ch >> 4];
        *dst++ = hex[ch & 15];
    }
    *dst = 0;                   /* Terminate */

    return escaped;
}
/* Decode %xx escaped characters HTUnEscape()
** -----------------------------
**
** This function takes a pointer to a string in which some
** characters may have been encoded in %xy form, where xy is
** the ASCII hex code for character 16x+y.
** The string is converted in place, as it will never grow.
*/
/* Value of one hex digit character.  '0'..'9' and 'A'..'F' are handled
** explicitly; every other character is treated as a lower case letter
** and is not checked, so the result for a non-hex character is simply
** c - 'a' + 10. */
PRIVATE char from_hex(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;        /* accept small letters just in case */
}
/* Decode HEX_ESCAPE sequences in place.
**
** On entry,
**	str	points to a writable zero terminated string.
** On exit,
**	returns	str, in which every HEX_ESCAPE followed by two characters
**		has been replaced by the single character from_hex() makes
**		of them.  The string never grows.
**
** Truncated escapes at the end of the string:
**	HEX_ESCAPE + one character	gives that digit's value times 16
**	HEX_ESCAPE alone		is kept as a literal HEX_ESCAPE
*/
PUBLIC char *HTUnEscape(char *str)
{
    char *p = str;              /* read position */
    char *q = str;              /* write position, never ahead of p */

    while (*p)
    {
        if (*p != HEX_ESCAPE)
        {
            *q++ = *p++;
            continue;
        }

        p++;                    /* step over the escape character */
        if (!*p)
        {
            /* Nothing follows.  The old code advanced q here without
               storing anything, so once an earlier escape had made q lag
               behind p a stale byte ended up in the output. */
            *q++ = HEX_ESCAPE;
            break;
        }
        *q = from_hex(*p++) * 16;
        if (*p)
            *q = *q + from_hex(*p++);
        q++;
    }
    *q = 0;
    return str;
}                               /* HTUnEscape */