windows-nt-4.0/private/inet/ohnt/ie/core/htparse.c


								/*

								   This file was derived from the libwww code, version 2.15, from CERN.

								   A number of modifications have been made by Spyglass.


								   eric@spyglass.com

								 */

								/*      Parse HyperText Document Address        HTParse.c

								   **       ================================

								 */


								#include "all.h"


								#define HEX_ESCAPE '%'


								struct struct_parts

								{

									char *access;

									char *host;

									char *absolute;

									char *relative;

								/*  char * search;      no - treated as part of path */

									char *anchor;

								};


								/*  Strip white space off a string
								   **   ------------------------------
								   **
								   ** On entry,
								   **   s       modifiable zero-terminated string, or NULL.
								   ** On exit,
								   **   Return value points to first non-white character, or to the
								   **   terminating zero if there is none.  NULL is returned for NULL input.
								   **   Every CR and LF after the leading white space is deleted in place.
								   **   All trailing white space is OVERWRITTEN with zero.
								   **
								   ** CMF 3-29-95: strip everything after a cr or lf
								   ** CMF 6-08-95: rather than strip everything after a cr or lf, just delete them
								   ** 			   this is for compatibility w/ netscape and some URLs
								 */
								PUBLIC char *HTStrip(char *s)
								{
								#define ISSPACE(c) (((c)==' ')||((c)=='\t')||((c)=='\n')||((c)=='\r'))
									char *end;
									char *src;

									if (!s)
										return NULL;			/* Doesn't dump core if NULL */

									while (ISSPACE(*s))
										s++;					/* Strip leading blanks */

									/* Compact the string in place, dropping embedded CR/LF. */
									for (end = s, src = s; *src; src++)
									{
										if (*src != CR && *src != LF)
											*end++ = *src;
									}
									*end = '\0';

									/* Zap trailing blanks.  Looking at end[-1] keeps the pointer inside
									   the string even when nothing is left after stripping. */
									while (end > s && ISSPACE(end[-1]))
										*--end = '\0';

									return s;
								}


								/* Nonzero when p starts like an absolute file path: a leading '/' or '\',
								   or any first character followed by ':' or '|' (as in "c:" or "c|").
								   An empty string is never absolute. */
								INLINE BOOL bIsAbsolute(const char *p)
								{
									const char lead = p[0];

									if (lead == '/' || lead == '\\')
										return 1;
									if (lead == '\0')
										return 0;			/* do not look past the terminator */
									return (p[1] == ':' || p[1] == '|');
								}


								/* Nonzero when p begins with two backslashes or two forward slashes,
								   i.e. looks like the start of a UNC name.  Mixed pairs do not count. */
								INLINE BOOL bIsUNC(const char *p)
								{
									if (p[0] == '\\')
										return (p[1] == '\\');
									if (p[0] == '/')
										return (p[1] == '/');
									return 0;
								}


								/* Nonzero when the second character of p is ':' or '|', i.e. p starts
								   with a drive designator such as "c:" or "c|".
								   The first character is checked for the terminator before p[1] is read,
								   so an empty string yields 0 instead of inspecting the byte after it. */
								INLINE BOOL bIsDrive(const char *p)
								{
									return (p[0] != '\0' && (p[1] == ':' || p[1] == '|'));
								}


								/*  Scan a filename for its constituents
								   **   -----------------------------------
								   **
								   ** On entry,
								   **   name    points to a document name which may be incomplete.
								   **           The buffer is MODIFIED: separators are overwritten with
								   **           zero and parts of it are lowercased in place.
								   ** On exit,
								   **      absolute or relative may be nonzero (but not both).
								   **   host, anchor and access may be nonzero if they were specified.
								   **   Any which are nonzero point to zero terminated strings
								   **   inside name.
								 */

								PRIVATE void scan(char *name, struct struct_parts *parts)

								{

									char *after_access;

									char *p;

									/* Length is taken before any separator is overwritten, so the
									   anchor loop below still covers the whole original text. */
									int length = strlen(name);


									parts->access = 0;

									parts->host = 0;

									parts->absolute = 0;

									parts->relative = 0;

									parts->anchor = 0;


									after_access = name;


									/* Look for "access:" at the front.  The search gives up at the
									   first '/' or '#'.  A leading "url:" is skipped and the search
									   carries on after it. */
									for (p = name; *p; p++)

									{

										if (*p == ':')

										{

											*p = 0;

											GTR_MakeStringLowerCase(after_access);

											if (!strcmp(after_access,"url"))

											{

												after_access = p + 1;

												continue;

											}

											parts->access = after_access;	/* Access name has been specified */

											after_access = p + 1;

											break;

										}

										if (*p == '/')

											break;

										if (*p == '#')

											break;

									}


									/* Walk backwards over the whole buffer.  Every '#' is replaced by
									   zero; because the walk ends at the front, anchor is left pointing
									   just after the FIRST '#'.
									   NOTE(review): the loop ends by forming name - 1 (and starts there
									   when length is 0); that pointer is never dereferenced. */
									for (p = name + length - 1; p >= name; p--)

									{

										if (*p == '#')

										{

											parts->anchor = p + 1;

											*p = 0;				/* terminate the rest */

										}

									}

									p = after_access;

									if (parts->access && !strcmp(parts->access, "file"))

									{

									//	sometimes file: URLS look like 		file:///c|/blah.htm

									//							  or		file:///blah.htm

									//							  or		file:c:/blah.htm

									//							  or		file:/blah.htm

										if (!strncmp(p, "///", 3))

										{

											p += 3;

										}

										if (bIsAbsolute(p))

										{

											/* Keep p as is for a drive form or a doubled leading
											   slash/backslash; otherwise step over the single leading
											   slash or backslash. */
											parts->absolute = ((*p != '\\' && *p != '/') || (!strncmp(p, "\\\\", 2)) ||(!strncmp(p, "//", 2))) ? p : p + 1;

										}

										else

										{

											/* NOTE(review): this uses after_access, not p, so a "///"
											   skipped above is still part of relative -- confirm that
											   is intended. */
											parts->relative = (*after_access) ? after_access : 0;	/* zero for "" */

										}

									}

									else

									{

										if (*p == '/')

										{

											if (p[1] == '/')

											{

												char *phost;


												parts->host = p + 2;	/* host has been specified  */

												*p = 0;				/* Terminate access         */

												p = strchr(parts->host, '/');	/* look for end of host name if any */

												if (p)

												{

													*p = 0;			/* Terminate host */

													parts->absolute = p + 1;	/* Root has been found */

												}

												/* Lowercase the host for the access types listed below.
												   For ftp and telnet only the text from the last '@'
												   onwards is lowercased, leaving the login untouched. */
												phost = NULL;

												if (parts->access && ((!strcmp(parts->access,"ftp")) ||	(!strcmp(parts->access,"telnet"))))

												{

													phost = strrchr(parts->host, '@'); 	/* login for telnet,ftp */

													if (!phost) phost = parts->host;

												}

												else if (parts->access && ((!strcmp(parts->access,"gopher")) || (!strcmp(parts->access,"http")) || (!strcmp(parts->access,"wais"))))

													phost = parts->host;

												if (phost) GTR_MakeStringLowerCase(phost);

											}

											else

											{

												parts->absolute = p + 1;	/* Root found but no host */

											}

										}

										else if (*p == '\\')

										{

											/* A doubled backslash is kept in the path; a single one is
											   stepped over. */
											parts->absolute = p[1] == '\\' ? p : p + 1;

										}

										else

										{

											parts->relative = (*after_access) ? after_access : 0;	/* zero for "" */

										}

									}


								/* Compiled only when OLD_CODE is defined. */
								#ifdef OLD_CODE

									/* Access specified but no host: the anchor was not really one

									   e.g. news:j462#36487@foo.bar -- JFG 10/jul/92, from bug report */

									/* This kludge doesn't work for example when coming across

									   file:/usr/local/www/fred#123

									   which loses its anchor. Correct approach in news is to

									   escape weird characters not allowed in URL.  TBL 21/dec/93

									 */

									if (parts->access && !parts->host && parts->anchor)

									{

										*(parts->anchor - 1) = '#';		/* Restore the '#' in the address */

										parts->anchor = 0;

									}

								#endif


								/* Compiled only when NOT_DEFINED is defined.  NOTE(review): this block
								   refers to relative, absolute and parts->search, none of which is
								   declared in the code visible here. */
								#ifdef NOT_DEFINED				/* search is just treated as part of path */

									{

										char *p = relative ? relative : absolute;

										if (p)

										{

											char *q = strchr(p, '?');	/* Any search string? */

											if (q)

											{

												*q = 0;			/* If so, chop that off. */

												parts->search = q + 1;

											}

										}

									}

								#endif

								}								/*scan */


								/*  Parse a Name relative to another name
								   **   -------------------------------------
								   **
								   **   This returns those parts of a name which are given (and requested)
								   **   substituting bits from the related name where necessary.
								   **
								   ** On entry,
								   **   aName           A filename given
								   **   relatedName     A name relative to which aName is to be parsed
								   **   wanted          A mask for the bits which are wanted
								   **                   (PARSE_ACCESS, PARSE_HOST, PARSE_PATH, PARSE_ANCHOR,
								   **                   PARSE_PUNCTUATION are the bits tested here).
								   **
								   ** On exit,
								   **   returns     The value of GTR_strdup() applied to the assembled
								   **               string: a pointer to a malloc'd string which MUST BE FREED
								   **
								   **   Neither input string is modified; both are copied into local
								   **   buffers first.
								 */

								char *HTParse(const char *aName, const char *relatedName, int wanted)

								{

									char *return_value = 0;

									char *p;

									char *access;

									struct struct_parts given, related;

									char name[MAX_URL_STRING+1];

									char rel[MAX_URL_STRING+1];

									char result[2*MAX_URL_STRING+1];	/* Make this longer to avoid overflow */

									BOOL bIsFile;


									/* Make working copies of input strings to cut up:
									   (presumably GTR_strncpy bounds and terminates the copy -- TODO confirm)

									 */

								 	GTR_strncpy(name, aName, MAX_URL_STRING);

								 	GTR_strncpy(rel, relatedName, MAX_URL_STRING);


									scan(name, &given);

									scan(rel, &related);

									result[0] = 0;				/* Clear string  */

									/* The given access wins; otherwise inherit the related one. */
									access = given.access ? given.access : related.access;

									if (wanted & PARSE_ACCESS)

										if (access)

										{

											strcat(result, access);

											if (wanted & PARSE_PUNCTUATION)

												strcat(result, ":");

										}


									if (given.access && related.access)		/* If different, inherit nothing. */

										if (strcmp(given.access, related.access) != 0)

										{

											related.host = 0;

											related.absolute = 0;

											related.relative = 0;

											related.anchor = 0;

										}

									bIsFile = (access && (!_stricmp(access, "file")));


									if (wanted & PARSE_HOST)

										if (given.host || related.host)

										{

											/* tail marks where the host text (including any "//") starts in result */
											char *tail = result + strlen(result);

											if (wanted & PARSE_PUNCTUATION)

												strcat(result, "//");

											if (given.host)

											{

												strcat(result, given.host);

											}

											else

											{

												strcat(result, related.host);

											}


											/* Ignore default port numbers, and trailing dots on FQDNs

											   which will only cause identical addresses to look different */

											{

												/* this p shadows the function-level p */
												char *p;

												p = strchr(tail, ':');

												if (p && access)

												{				/* Port specified */

								 					if (   (   strcmp(access, "http") == 0

								 							&& strcmp(p, ":80") == 0)

								 						|| (   strcmp(access, "gopher") == 0

								 							&& strcmp(p, ":70") == 0)

								#ifdef HTTPS_ACCESS_TYPE

								 						|| (   strcmp(access, "https") == 0

								 							&& strcmp(p, ":443") == 0)

								#endif

								#ifdef SHTTP_ACCESS_TYPE

								 						|| (   strcmp(access, "shttp") == 0

								 							&& strcmp(p, ":80") == 0)

								#endif

													   )

														*p = (char) 0;	/* It is the default: ignore it */

												}

												if (!p)

													p = tail + strlen(tail);	/* After hostname */

												/* NOTE(review): p points at non-empty text only when a
												   non-default ":port" is still present, so the dot check
												   below runs only in that case -- and zeroing the dot
												   then cuts the port off as well.  With no port, or a
												   default port that was just removed, a trailing '.'
												   is left in place.  Confirm whether that is intended. */
												if (strlen (p))	/* -dpg */

												{

													p--;			/* End of hostname */

													if (*p == '.')

														*p = (char) 0;	/* chop final . */

												}

											}

										}


									if (given.host && related.host)		/* If different hosts, inherit no path. */

										if (strcmp(given.host, related.host) != 0)

										{

											related.absolute = 0;

											related.relative = 0;

											related.anchor = 0;

										}


									if (wanted & PARSE_PATH)

									{

										/* A given host means the related path must not be adopted. */
										BOOL bOKToInheritPath = (!given.host);

										BOOL bRIsUNC = bIsFile && related.absolute && bIsUNC(related.absolute);

										BOOL bRIsDrive = bIsFile && related.absolute && bIsDrive(related.absolute);


										if (given.absolute)

										{

											BOOL bBack = FALSE;


										// 	All is given, except for perhaps the drive or unc root

											if (bIsFile)

											{

												BOOL bGIsUNC = given.absolute && bIsUNC(given.absolute);

												BOOL bGIsDrive = given.absolute && bIsDrive(given.absolute);


												/* Use a backslash separator when either path shows a
												   drive, a UNC prefix or any backslash. */
												bBack = bRIsUNC || bRIsDrive || bGIsUNC || bGIsDrive || strchr(given.absolute, '\\');

												if ((!(bGIsUNC || bGIsDrive)) && (bRIsUNC || bRIsDrive))

												{

													/* e marks the end of the root prefix to take over
													   from the related path. */
													const char *e = NULL;


													if (bRIsDrive)

													{

														e = related.absolute + 2;

													}

													else

													{

														/* Second backslash after the leading pair.  Only
														   '\\' is searched for, so a UNC prefix written
														   with forward slashes leaves e NULL and nothing
														   is taken over. */
														if (e = strchr(related.absolute + 2, '\\'))

															e = strchr(e + 1, '\\');

													}

													if (e)

														GTR_strncat(result, related.absolute, e - related.absolute);

												}

											}

											if (((!bIsFile) || !bIsAbsolute(given.absolute)) && (wanted & PARSE_PUNCTUATION))

												strcat(result, bBack ? "\\" : "/");

											strcat(result, given.absolute);

										}

										else if (related.absolute && bOKToInheritPath)

										{						/* Adopt path not name */

											char slash1 = '/';

											char slash2 = slash1;

											BOOL bBack = FALSE;


											if (bIsFile)

											{

												slash2 = '\\';

												bBack = bRIsUNC || bRIsDrive || strchr(related.absolute, '\\');

											}

											if ((!bIsFile) || !bIsAbsolute(related.absolute))

												strcat(result, bBack ? "\\" : "/");

											strcat(result, related.absolute);


											if (given.relative)

											{

												p = strchr(result, '?');	/* Search part? */

												if (!p)

													p = result + strlen(result) - 1;

												/* NOTE(review): this backward scan has no lower bound.
												   If result holds no slash1/slash2 at or before p
												   (e.g. a file path made only of a drive designator)
												   it runs off the front of result. */
												for (; *p != slash1 && *p != slash2; p--) ;	/* last / */

												p[1] = 0;		/* Remove filename */

												strcat(result, given.relative);		/* Add given one */

												HTSimplify(result);

											}

										}

										else if (given.relative)

										{

											/* The following 3 lines were copied from NCSA Mosaic for Windows */

											if ((wanted & PARSE_HOST) && (given.host || related.host) && (wanted & PARSE_PUNCTUATION))

												if (result[strlen(result) - 1] != '/')

													strcat(result, "/");

											strcat(result, given.relative);		/* what we've got */

										}

										else if (related.relative && bOKToInheritPath)

										{

											strcat(result, related.relative);

										}

										else

										{						/* No inheritance */

											strcat(result, "/");

										}

									}


									if (wanted & PARSE_ANCHOR)

										if (given.anchor || related.anchor)

										{

											if (wanted & PARSE_PUNCTUATION)

												strcat(result, "#");

											strcat(result, given.anchor ? given.anchor : related.anchor);

										}


									/* We truncate URLs to MAX_URL_STRING bytes if they're too long
									   (the original comment said 1024 -- TODO confirm that is still
									   the value of MAX_URL_STRING). */

									result[MAX_URL_STRING] = '\0';

									return_value = GTR_strdup(result);


									return return_value;		/* exactly the right length */

								}


								/*
								   **   As strcpy() but guaranteed to work correctly
								   **   with overlapping parameters.    AL 7 Feb 1994
								   **
								   **   Does nothing when either pointer is NULL.  The copy, including the
								   **   terminating zero, is done with memmove(), so no temporary buffer is
								   **   needed and the copy cannot fail part-way.
								 */
								PRIVATE void ari_strcpy(char *to, char *from)
								{
									if (!to || !from)
										return;

									memmove(to, from, strlen(from) + 1);
								}

								/*          Simplify a filename
								   //       -------------------
								   //
								   // A unix-style file is allowed to contain the sequence xxx/../ which may be
								   // replaced by "" , and the sequence "/./" which may be replaced by "/".
								   // Simplification helps us recognize duplicate filenames.
								   //
								   //   Thus,   /etc/junk/../fred   becomes /etc/fred
								   //       /etc/junk/./fred    becomes /etc/junk/fred
								   //       /a/././b            becomes /a/b
								   //
								   //   Leading '/' and '.' characters are skipped untouched, so
								   //       ../../albert.html
								   //   is left alone, as is an xxx/.. with no slash in front of xxx
								   //   (abc/../x stays as it is).
								   //
								   //   When the segment before "/.." is introduced by a double slash (the
								   //   host in http://fred.xxx.edu/../fred.gif) the host is kept and only
								   //   the "/.." is dropped.
								   //
								   //	CMF 5/26/95.  Note that many servers now bounce requests like
								   //	http://fred.xxx.edu/../fred.gif with a 403 for security.
								   //
								   //   The string is edited in place and never grows, except that a path
								   //   reduced to nothing becomes "/".  A NULL filename is ignored.
								 */
								PUBLIC void HTSimplify(char *filename)
								{
									char *p;
									char *q;

									if (!filename)
										return;

									p = filename;
									while (*p == '/' || *p == '.')	/* Pass starting / or .'s */
										p++;

									while (*p)
									{
										if (*p != '/')
										{
											p++;
											continue;
										}

										if ((p[1] == '.') && (p[2] == '.') && (p[3] == '/' || !p[3]))
										{
											/* Find the slash that opens the previous segment without
											   ever stepping below filename. */
											q = p;
											while (q > filename && q[-1] != '/')
												q--;

											if (q > filename)
											{
												q--;				/* q is now on the previous slash */
												if (0 != strncmp(q, "/../", 4))
												{
													if (q > filename + 1 && q[-1] == '/')
													{
														/* "//xxx/.." : xxx follows a double slash,
														   keep it and drop only the "/.." */
														ari_strcpy(p, p + 3);
													}
													else
													{
														ari_strcpy(q, p + 3);	/* Remove  /xxx/..  */
														if (!*filename)
															strcpy(filename, "/");
														p = q;		/* Start again with prev slash  */
													}
													continue;		/* look at the same spot again */
												}
											}
											/* No previous slash, or the previous segment is itself
											   "..": leave it. */
										}
										else if ((p[1] == '.') && (p[2] == '/' || !p[2]))
										{
											ari_strcpy(p, p + 2);	/* Remove a slash and a dot */
											/* Re-examine this slash: what moved up may be another
											   "/." or "/..", and after a trailing "/." p now sits on
											   the terminator. */
											continue;
										}

										p++;
									}
								}


								/*      Escape undesirable characters using %       HTEscape()
								   **       -------------------------------------
								   **
								   **   This function takes a pointer to a string in which
								   **   some characters may be unacceptable unescaped.
								   **   It returns a string which has these characters
								   **   represented by a '%' character followed by two hex digits.
								   **
								   ** On entry,
								   **   str         zero-terminated string to escape (NULL gives NULL).
								   **   mask        bits selecting which isAcceptable[] classes may pass
								   **               through unescaped.
								   **   protect     one extra character that is always passed through;
								   **               it is compared as an unsigned byte so characters
								   **               above 127 work whatever the signedness of char.
								   ** On exit,
								   **   returns     buffer obtained from GTR_MALLOC, or NULL if that failed.
								   **
								   **   Unlike HTUnEscape(), this routine returns a malloced string.
								 */

								PRIVATE CONST unsigned char isAcceptable[96] =

								/*   Bit 0       xalpha      -- see HTFile.h
								**   Bit 1       xpalpha     -- as xalpha but with plus.
								**   Bit 2 ...   path        -- as xpalphas but with /
								*/
								/*   0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
									{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 7, 7, 4,	/* 2x   !"#$%&'()*+,-./  */
									 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0,	/* 3x  0123456789:;<=>?  */
									 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,	/* 4x  @ABCDEFGHIJKLMNO  */
									 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7,	/* 5X  PQRSTUVWXYZ[\]^_  */
									 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,	/* 6x  `abcdefghijklmno  */
									 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0};	/* 7X  pqrstuvwxyz{|}~  DEL */

								PRIVATE char *hex = "0123456789ABCDEF";

								PUBLIC char *HTEscape(CONST char *str, unsigned char mask, char protect)
								{
								/* a must be an unsigned char value; mask and protect are the parameters above */
								#define ACCEPTABLE(a)	((((a) >= 32) && ((a) < 128) && ((isAcceptable[(a) - 32]) & mask)) || ((a) == (unsigned char) protect))
									CONST char *p;
									char *q;
									char *result;
									size_t unacceptable = 0;

									if (!str)
										return NULL;

									/* First pass: count the characters that need three bytes each. */
									for (p = str; *p; p++)
									{
										unsigned char a = (unsigned char) *p;
										if (!ACCEPTABLE(a))
											unacceptable++;
									}

									/* p - str is the input length; each escape adds two bytes. */
									result = (char *) GTR_MALLOC((size_t) (p - str) + 2 * unacceptable + 1);
									if (!result)
										return NULL;

									/* Second pass: copy, expanding as we go. */
									for (q = result, p = str; *p; p++)
									{
										unsigned char a = (unsigned char) *p;
										if (ACCEPTABLE(a))
										{
											*q++ = *p;
										}
										else
										{
											*q++ = HEX_ESCAPE;	/* Means hex coming */
											*q++ = hex[a >> 4];
											*q++ = hex[a & 15];
										}
									}
									*q = 0;						/* Terminate */

									return result;
								}


								/*      Decode %xx escaped characters           HTUnEscape()
								   **       -----------------------------
								   **
								   **   This function takes a pointer to a string in which some
								   **   characters may have been encoded in %xy form, where xy is
								   **   the ascii hex code for character 16x+y.
								   **   The string is converted in place, as it will never grow.
								   **
								   **   A '%' that is not followed by two hex digits (including one cut
								   **   short by the end of the string) is copied through unchanged.
								   **
								   ** On exit,
								   **   returns     str, now holding the decoded text.
								 */

								/* Nonzero when c is one of 0-9, A-F or a-f. */
								PRIVATE int is_hex_digit(char c)
								{
									return (c >= '0' && c <= '9')
										|| (c >= 'A' && c <= 'F')
										|| (c >= 'a' && c <= 'f');
								}

								/* Value of the hex digit c.  Anything that is not 0-9 or A-F is treated
								   as a small letter, so callers should check the digit first. */
								PRIVATE char from_hex(char c)
								{
									if (c >= '0' && c <= '9')
										return c - '0';
									if (c >= 'A' && c <= 'F')
										return c - 'A' + 10;
									return c - 'a' + 10;		/* accept small letters just in case */
								}

								PUBLIC char *HTUnEscape(char *str)
								{
									char *p = str;				/* read position */
									char *q = str;				/* write position, never ahead of p */

									while (*p)
									{
										/* p[2] is only looked at once p[1] is known to be a hex digit,
										   so the terminator is never passed. */
										if (*p == HEX_ESCAPE && is_hex_digit(p[1]) && is_hex_digit(p[2]))
										{
											*q++ = (char) (from_hex(p[1]) * 16 + from_hex(p[2]));
											p += 3;
										}
										else
										{
											*q++ = *p++;
										}
									}

									*q = 0;
									return str;
								}								/* HTUnEscape */