 /* Whitespace characters. */
W               [ \t\r\n]
 /* Characters accepted in an attribute value that is not enclosed in quotes. */
F               [-a-z0-9$_.!*(),%;/?:@&=+~|#]
 /* Characters accepted in a tag name or an attribute name. */
K               [a-z0-9-]

 /* Exclusive start conditions, one for each part of the document that has its own rules. */
%x DOCTYPE
%x COMMENT COMMENT_BAD
%x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
%x DQUOTED SQUOTED

%{
/***************************************
  $Header: /home/amb/wwwoffle/RCS/html.l 2.47 2000/02/14 19:20:09 amb Exp $

  WWWOFFLE - World Wide Web Offline Explorer - Version 2.5d.
  Parse the HTML and look for the images, links and other things.
  ******************/ /******************
  Written by Andrew M. Bishop
  Object handling by Walter Pfannenmueller

  This file Copyright 1997,98,99,2000 Andrew M. Bishop
  It may be distributed under the GNU Public License, version 2, or
  any higher version.  See section COPYING of the GNU Public license
  for conditions under which this file may be redistributed.
  ***************************************/


#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include <sys/stat.h>
#include <unistd.h>
#include <time.h>

#include "wwwoffle.h"
#include "document.h"
#include "config.h"
#include "misc.h"

#include "errors.h"

/* Parser outputs - the token codes that html_yylex() hands to parse_html().
   Zero is not used here because the lexer returns it at the end of the input. */

enum
{
 LEX_PLAINTEXT = 1,  /* Text outside of any tag.         */
 LEX_COMMENT   = 2,  /* A comment.                       */
 LEX_DOCTYPE   = 3,  /* A DOCTYPE declaration.           */

 LEX_TAG_BEGIN = 4,  /* The name at the start of a tag.  */
 LEX_TAG_END   = 5,  /* The end of a tag.                */

 LEX_ATTR_KEY  = 6,  /* The name of an attribute.        */
 LEX_ATTR_VAL  = 7   /* The value of an attribute.       */
};

/* Tag types - each value is the index of the tag name in the tags[] array,
   tag_ntags is the number of tags and is also used to mean 'no known tag'. */

typedef enum _HTMLTags
{
 tag_a         = 0  /* "a"          */ ,
 tag_applet    = 1  /* "applet"     */ ,
 tag_area      = 2  /* "area"       */ ,
 tag_base      = 3  /* "base"       */ ,
 tag_blockquote= 4  /* "blockquote" */ ,
 tag_body      = 5  /* "body"       */ ,
 tag_del       = 6  /* "del"        */ ,
 tag_embed     = 7  /* "embed"      */ ,
 tag_frame     = 8  /* "frame"      */ ,
 tag_head      = 9  /* "head"       */ ,
 tag_iframe    =10  /* "iframe"     */ ,
 tag_img       =11  /* "img"        */ ,
 tag_input     =12  /* "input"      */ ,
 tag_ins       =13  /* "ins"        */ ,
 tag_link      =14  /* "link"       */ ,
 tag_meta      =15  /* "meta"       */ ,
 tag_object    =16  /* "object"     */ ,
 tag_param     =17  /* "param"      */ ,
 tag_q         =18  /* "q"          */ ,
 tag_script    =19  /* "script"     */ ,
 tag_xml       =20  /* "xml"        */ ,
 tag_ntags     =21
}
HTMLTags;

/* Tag strings - compared (ignoring case) against the whole tag name found in
   the document, so each one must be spelt exactly like the HTML element. */

static char *tags[]=
{
 /* tag_a         = 0  */  "a"          ,
 /* tag_applet    = 1  */  "applet"     ,
 /* tag_area      = 2  */  "area"       ,
 /* tag_base      = 3  */  "base"       ,
 /* tag_blockquote= 4  */  "blockquote" ,
 /* tag_body      = 5  */  "body"       ,
 /* tag_del       = 6  */  "del"        ,
 /* tag_embed     = 7  */  "embed"      ,
 /* tag_frame     = 8  */  "frame"      ,
 /* tag_head      = 9  */  "head"       ,
 /* tag_iframe    =10  */  "iframe"     ,
 /* tag_img       =11  */  "img"        ,
 /* tag_input     =12  */  "input"      ,
 /* tag_ins       =13  */  "ins"        ,
 /* tag_link      =14  */  "link"       ,
 /* tag_meta      =15  */  "meta"       ,
 /* tag_object    =16  */  "object"     ,
 /* tag_param     =17  */  "param"      ,
 /* tag_q         =18  */  "q"          ,
 /* tag_script    =19  */  "script"     ,
 /* tag_xml       =20  */  "xml"       
};

/* Attribute types - each value is the index of the attribute name in the
   attributes[] array, att_natts is the number of attributes in that array. */

typedef enum _HTMLAttributes
{
 att_archive    =  0,
 att_background =  1,
 att_cite       =  2,
 att_classid    =  3,
 att_code       =  4,
 att_codebase   =  5,
 att_codetype   =  6,
 att_content    =  7,
 att_data       =  8,
 att_href       =  9,
 att_http_equiv = 10,
 att_longdesc   = 11,
 att_name       = 12,
 att_object     = 13,
 att_profile    = 14,
 att_rel        = 15,
 att_src        = 16,
 att_type       = 17,
 att_usemap     = 18,
 att_value      = 19,
 att_valuetype  = 20,
 att_natts      = 21
}
HTMLAttributes;

/* Attribute strings - one name for each HTMLAttributes value, in the same order. */

static char *attributes[att_natts]=
{
 "archive",      /*  0 = att_archive    */
 "background",   /*  1 = att_background */
 "cite",         /*  2 = att_cite       */
 "classid",      /*  3 = att_classid    */
 "code",         /*  4 = att_code       */
 "codebase",     /*  5 = att_codebase   */
 "codetype",     /*  6 = att_codetype   */
 "content",      /*  7 = att_content    */
 "data",         /*  8 = att_data       */
 "href",         /*  9 = att_href       */
 "http-equiv",   /* 10 = att_http_equiv */
 "longdesc",     /* 11 = att_longdesc   */
 "name",         /* 12 = att_name       */
 "object",       /* 13 = att_object     */
 "profile",      /* 14 = att_profile    */
 "rel",          /* 15 = att_rel        */
 "src",          /* 16 = att_src        */
 "type",         /* 17 = att_type       */
 "usemap",       /* 18 = att_usemap     */
 "value",        /* 19 = att_value      */
 "valuetype"     /* 20 = att_valuetype  */
};


/* The parser proper, defined further down and called only from ParseHTML(). */
static void parse_html(void);

/*+ The text that goes with the token most recently returned by html_yylex(),
    it is NULL for an attribute that was written without a value. +*/
static char *html_yylval=NULL;
extern int html_yylex(void);


/*+ The refresh content of a Meta tag. +*/
static char *meta_refresh=NULL;

/*+ The content of a Base tag. +*/
static char *base_url=NULL;

/*+ The file descriptor that we are reading from. +*/
static int html_yyfd=-1;

/*+ The base URL of this page. +*/
static URL *baseUrl=NULL;

/*+ The quote character used. +*/
/* Set by the lexer rules to "", "\"" or "'" for each attribute value; nothing in this file reads it. */
static char *quote="";


/*++++++++++++++++++++++++++++++++++++++
  Scan an HTML document and record the images, links, frames and other
  references that it contains.

  int fd The file descriptor that the document is read from.

  URL *Url The URL of the document, used as the base URL unless the document
  contains a Base tag.
  ++++++++++++++++++++++++++++++++++++++*/

void ParseHTML(int fd,URL *Url)
{
 /* Set once the lexer has been run, after that it must be restarted for each new document. */
 static int lexer_used=0;

 PrintMessage(Debug,"Parsing document using HTML parser.");

 /* Forget whatever the previous document left behind. */

 base_url=NULL;
 baseUrl=Url;
 SetBaseURL(Url);

 free(meta_refresh);
 meta_refresh=NULL;

 /* Run the lexer and parser over the new file. */

 html_yyfd=fd;
 if(lexer_used)
    html_yyrestart(NULL);
 lexer_used=1;

 parse_html();

 /* A Base tag in the document replaces the base URL. */

 if(!base_url)
    return;

 baseUrl=SplitURL(base_url);
 SetBaseURL(baseUrl);
}


/*++++++++++++++++++++++++++++++++++++++
  Give the URL that a Meta Refresh tag in the last parsed document points to.

  char *MetaRefresh Returns the URL (after it has been passed through LinkURL
  with the base URL) or NULL if the document had no usable Meta Refresh tag.
  ++++++++++++++++++++++++++++++++++++++*/

char *MetaRefresh(void)
{
 char *linked;

 if(!meta_refresh)
    return(NULL);

 linked=LinkURL(baseUrl,meta_refresh);

 /* Keep the result of LinkURL in place of the stored string when it is a different one. */

 if(linked!=meta_refresh)
   {
    free(meta_refresh);
    meta_refresh=linked;
   }

 return(meta_refresh);
}


/*++++++++++++++++++++++++++++++++++++++
  Object and Param treatment:
  this is the attempt to extract all 
  valid URIs from the OBJECT or PARAM tag
  defined in HTML 4.0.

  there is a problem with inline data, classes, ..
  or
  URIs of the form
  java:...
  data:...
  clsid:...
  and inline data
 
  these will be taken care of later.

  (c) Walter Pfannenmueller
  ++++++++++++++++++++++++++++++++++++++*/

/* The largest number of archives that are kept for one object, any more are ignored. */
#define obj_archives_max 32

/* Object - the index in obj_parts[] of each attribute value that is kept.
   The archives use the slots starting at obj_archives_start. */
enum {
    obj_classid        = 0,
    obj_codetype       = 1,
    obj_codebase       = 2,
    obj_code           = 3,
    obj_object         = 4,
    obj_data           = 5,
    obj_usemap         = 6,
    obj_type           = 7,
    obj_longdesc       = 8,

    obj_archives_start = 9,
    obj_archives_end   = obj_archives_start + obj_archives_max,

    obj_parts_size     = obj_archives_end + 1
};

/* Set to RefImage when the codetype attribute of the current object starts with "image". */
static int obj_codetype_Type = RefObject; 
/* Set to RefImage when the type attribute of the current object starts with "image". */
static int obj_type_Type = RefObject; 
/* The number of archive slots of obj_parts[] that are in use. */
static int obj_narchives = 0;

/* The attribute values collected for the current object, each one is malloced or NULL. */
static char *obj_parts[obj_parts_size] = { NULL, }; 

/* Param - the index in param_parts[] of each attribute value that is kept. */

enum {
   param_type       = 0,
   param_value      = 1,
   param_parts_size = 2
};

/* Non-zero when the attributes seen so far say that the value is a reference. */
static int param_valuetype_is_ref = 0;

/* The attribute values collected for the current param, each one is malloced or NULL. */
static char *param_parts[param_parts_size] = { NULL, NULL };


/*++++++++++++++++++++++++++++++++++++++
  Release the string held in one slot and mark the slot as empty.

  char **op The slot to empty, it may already be NULL.
  ++++++++++++++++++++++++++++++++++++++*/
static void op_free(char **op)
{
    /* Freeing NULL does nothing so an empty slot needs no special case. */
    free(*op);
    *op = NULL;
}

/*++++++++++++++++++++++++++++++++++++++
  Store a copy of a string in one slot, releasing whatever was there before.

  char **op The slot to fill, the previous contents (if any) are freed.

  char *text The string to copy, it is allowed to point into the string that
  the slot holds at the moment.

  If no memory can be allocated the slot is left empty (NULL).
  ++++++++++++++++++++++++++++++++++++++*/
static void op_malloc(char **op,char *text)
{
    /* Make the copy before freeing the old value in case text points into it. */
    char *copy = (char *)malloc(strlen(text) + 1);

    if(copy != NULL)
    {
        strcpy(copy,text);
    }

    free(*op);
    *op = copy;
}

/*++++++++++++++++++++++++++++++++++++++
  Take one of the object's parts, put the codebase in front of it (if there is
  one) and add the result as a reference.

  int part The index in obj_parts[] of the part to use, nothing is done if
  that part is not set.

  RefType refType The type of reference to add.

  The part in obj_parts[] is replaced by the joined string.  An empty codebase
  is treated the same as no codebase, and if memory for the joined string
  cannot be allocated then no reference is added.
  ++++++++++++++++++++++++++++++++++++++*/
static void codebase_url(int part, RefType refType)
{
    char *codebase = obj_parts[obj_codebase];

    if(!obj_parts[part])
    {
        return;
    }

    if(codebase)
    {
        size_t baselen = strlen(codebase);
        /* Room for the codebase, an optional '/', the part and the terminator. */
        char *url = (char *)malloc(baselen + 1 + strlen(obj_parts[part]) + 1);

        if(!url)
        {
            return;
        }

        strcpy(url,codebase);
        /* Only look at the last character when there is one. */
        if(baselen > 0 && url[baselen - 1] != '/')
        {
            strcat(url,"/");
        }
        strcat(url,obj_parts[part]);

        op_free(&obj_parts[part]);
        obj_parts[part] = url;
    }

    AddReference(obj_parts[part], refType);
}

/*++++++++++++++++++++++++++++++++++++++
  Take one of the object's parts, turn it from a Java class name into the name
  of a class file and add it as a reference (using the codebase).

  int part The index in obj_parts[] of the part to use, nothing is done if
  that part is not set.

  RefType refType The type of reference to add.

  ".class" is appended unless the name already ends with it and the '.'
  characters in front of the class name are changed to '/'.  If memory for
  the longer name cannot be allocated then no reference is added.
  ++++++++++++++++++++++++++++++++++++++*/
static void java_applet_url(int part, RefType refType)
{
    static const char class_suffix[] = ".class";
    const size_t suffixlen = sizeof(class_suffix) - 1;
    char *applet = obj_parts[part];
    size_t len, limit, i;

    if(!applet)
    {
        return;
    }

    len = strlen(applet);

    /* A name shorter than the suffix cannot end with it, and must not be
       indexed from before its start. */
    if(len < suffixlen || strcmp(applet + len - suffixlen,class_suffix))
    {
        char *named = (char *)malloc(len + sizeof(class_suffix));

        if(!named)
        {
            return;
        }

        strcpy(named,applet);
        strcat(named,class_suffix);
        op_free(&obj_parts[part]);
        obj_parts[part] = named;
        applet = named;
        len += suffixlen;
    }

    /* Change the package separators, stopping short of the last character of
       the class name and the suffix. */
    limit = len > sizeof(class_suffix) ? len - sizeof(class_suffix) : 0;
    for(i = 0; i < limit; i++)
    {
        if(applet[i] == '.')
        {
            applet[i] = '/';
        }
    }

    codebase_url(part,refType);
}

/*++++++++++++++++++++++++++++++++++++++
  Add the references for the object whose attributes have been collected and
  then forget everything that was collected.
  ++++++++++++++++++++++++++++++++++++++*/
static void build_obj_urls()
{
    int n;
    int is_image, is_java;
    RefType refType;

    /* Assuming that objects referenced from within objects are already in
       the archives (when there are any archives). */
    if(obj_narchives > 0)
    {
        refType = RefObject;
    }
    else
    {
        refType = RefInlineObject;
    }

    is_image = (obj_codetype_Type == RefImage) || (obj_type_Type == RefImage);

    /* The classid and data are only changed to .class files when the
       codetype says that this is a java object. */
    is_java = obj_parts[obj_codetype] != NULL &&
              strcmp(obj_parts[obj_codetype],"application/java") == 0;

    if(is_image)
    {
        codebase_url(obj_classid,RefImage);
        codebase_url(obj_data,RefImage);
    }
    else if(is_java)
    {
        java_applet_url(obj_classid,refType);
        java_applet_url(obj_data,refType);
    }
    else
    {
        codebase_url(obj_classid,refType);
        codebase_url(obj_data,refType);
    }

    java_applet_url(obj_code,refType);
    java_applet_url(obj_object,refType);

    codebase_url(obj_usemap,RefLink);
    codebase_url(obj_longdesc,RefLink);

    for(n = obj_archives_start; n < obj_archives_start + obj_narchives; n++)
    {
        codebase_url(n, RefObject);
    }

    /* Empty every slot and go back to the defaults for the next object. */
    for(n = 0; n < obj_parts_size; n++)
    {
        op_free(&obj_parts[n]);
    }

    obj_narchives = 0;
    obj_type_Type = RefObject;
    obj_codetype_Type = RefObject;
}


/*+++++++++++++++++++++++++++++++++++++++++
  Add the reference (if there is one) for the param whose attributes have been
  collected and then forget everything that was collected.
  +++++++++++++++++++++++++++++++++++++++++*/
static void build_param_urls()
{
    int n = 0;
    char *value = param_parts[param_value];

    /* The value is only a reference when the other attributes said so. */
    if(value != NULL && param_valuetype_is_ref)
    {
        AddReference(value,RefObject);
    }

    param_valuetype_is_ref = 0;

    while(n < param_parts_size)
    {
        op_free(&param_parts[n]);
        n++;
    }
}


/*++++++++++++++++++++++++++++++++++++++
  Find the URL in the content attribute of a Meta Refresh tag.

  char *meta_refresh_target Returns a pointer (into the content string) to the
  start of the URL or NULL if the content does not have the expected format.

  char *content The value of the content attribute.
  ++++++++++++++++++++++++++++++++++++++*/

static char *meta_refresh_target(char *content)
{
 char *p=content;

 /* ' *[0-9].?[0-9]* *[;,] *(URL *= *|)http://...' */

 /* The characters are passed to the ctype functions as unsigned char because
    a negative value (other than EOF) is not allowed there. */

 while(isspace((unsigned char)*p)) p++;
 if(!isdigit((unsigned char)*p))
    return(NULL);
 while(isdigit((unsigned char)*p)) p++;
 if(*p=='.')
   {p++; while(isdigit((unsigned char)*p)) p++;}
 while(isspace((unsigned char)*p)) p++;
 if(*p!=';' && *p!=',')
    return(NULL);
 p++;
 while(isspace((unsigned char)*p)) p++;
 if(!strncasecmp(p,"URL",3))
   {
    p+=3;
    while(isspace((unsigned char)*p)) p++;
    if(*p!='=')
       return(NULL);
    p++;
    while(isspace((unsigned char)*p)) p++;
   }
 if(!*p)
    return(NULL);

 return(p);
}


/*++++++++++++++++++++++++++++++++++++++
  Parse the HTML and look for references to image/links/frames.

  The tokens come from html_yylex() with their text in html_yylval.  Simple
  references are added as soon as the attribute value is seen, the attributes
  of object and param tags are collected and turned into references when the
  end of the tag is reached.  The href of a Base tag and the URL of a Meta
  Refresh tag are stored in base_url and meta_refresh (the last one seen wins).
  ++++++++++++++++++++++++++++++++++++++*/

static void parse_html(void)
{
 HTMLTags tag=tag_ntags;
 HTMLAttributes key=att_natts;
 RefType ref;
 int link_rel_style=0,meta_http_equiv_refresh=0;
 int yychar;

 /* The actual parser. */

 while((yychar=html_yylex()))
    switch(yychar)
      {
      case LEX_PLAINTEXT:
       break;

      case LEX_COMMENT:
       break;

      case LEX_DOCTYPE:
       break;

      case LEX_TAG_BEGIN:
       /* tag is left as tag_ntags if the name is not one that we know. */
       for(tag=0;tag<tag_ntags;tag++)
          if(!strcasecmp(html_yylval,tags[tag]))
             break;
       break;

      case LEX_TAG_END:
       if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
          build_obj_urls();
       if(tag==tag_param)
          build_param_urls();

       tag=tag_ntags;
       key=att_natts;
       link_rel_style=0,meta_http_equiv_refresh=0;
       break;

      case LEX_ATTR_KEY:
       if(tag==tag_ntags)
          break;

       /* key is left as att_natts if the name is not one that we know. */
       for(key=0;key<att_natts;key++)
          if(!strcasecmp(html_yylval,attributes[key]))
             break;
       break;

      case LEX_ATTR_VAL:
       if(key==att_natts)
          break;

       /* An attribute written without a value (e.g. '<a href>') gives a NULL
          string, there is nothing in it that can be used. */

       if(!html_yylval)
         {key=att_natts;break;}

       /* Simple links and stuff */

       ref=NRefTypes;

       if(key==att_href && (tag==tag_a || tag==tag_area))
          ref=RefLink;
       else if(key==att_src && (tag==tag_input || tag==tag_img))
          ref=RefImage;
       else if(key==att_src && tag==tag_script)
          ref=RefScript;
       else if(key==att_src && (tag==tag_frame || tag==tag_iframe))
          ref=RefFrame;
       else if(key==att_cite && (tag==tag_q || tag==tag_blockquote || tag==tag_ins || tag==tag_del))
          ref=RefLink;
       else if(key==att_background && tag==tag_body)
          ref=RefImage;
       else if(key==att_longdesc && (tag==tag_frame || tag==tag_iframe || tag==tag_img))
          ref=RefLink;
       else if(key==att_usemap && (tag==tag_input || tag==tag_img))
          ref=RefImage;
       else if(key==att_profile && tag==tag_head)
          ref=RefLink;

       if(ref!=NRefTypes)
         {AddReference(html_yylval,ref);break;}

       /* Other simple non-reference ones. */

       if(key==att_href && tag==tag_base)
         {
          char *copy=(char*)malloc(strlen(html_yylval)+1);

          /* Replace (and free) the value from any earlier Base tag in this document. */

          if(copy)
            {
             strcpy(copy,html_yylval);
             free(base_url);
             base_url=copy;
            }
          break;
         }

       /* Some more complicated ones that depend on other attributes. */

       if(tag==tag_link)
         {
          if(key==att_rel && !strncasecmp(html_yylval,"Stylesheet",10))
            {link_rel_style=1;break;}
          else if(key==att_href)
            {
             /* This is only a stylesheet if the rel attribute came before the href. */

             if(link_rel_style)
                AddReference(html_yylval,RefStyleSheet);
             else
                AddReference(html_yylval,RefLink);
             break;
            }
         }

       if(tag==tag_meta)
         {
          if(key==att_http_equiv && !strncasecmp(html_yylval,"Refresh",7))
            {meta_http_equiv_refresh=1;break;}
          else if(key==att_content && meta_http_equiv_refresh)
            {
             char *target=meta_refresh_target(html_yylval);

             if(target)
               {
                char *copy=(char*)malloc(strlen(target)+1);

                /* Replace (and free) the value from any earlier Meta Refresh tag in this document. */

                if(copy)
                  {
                   strcpy(copy,target);
                   free(meta_refresh);
                   meta_refresh=copy;
                  }
               }
             break;
            }
         }

       /* Complex object type ones. */

       if(tag==tag_param)
         {
          if(key==att_valuetype && !strcasecmp(html_yylval,"ref"))
             param_valuetype_is_ref = 1;
          else if(key==att_name && (!strcasecmp(html_yylval,"href") || !strcasecmp(html_yylval,"file") || !strcasecmp(html_yylval,"ref")))
             param_valuetype_is_ref = 1;

          else if(key==att_type)  op_malloc(&param_parts[param_type] ,html_yylval);
          else if(key==att_value) op_malloc(&param_parts[param_value],html_yylval);
         }

       else if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
         {
          if(key==att_src)
            {AddReference(html_yylval,RefInlineObject);break;}

          else if(key==att_archive)
            {
             char *p,*q=html_yylval;

             /* The list of archives is split in place, those beyond obj_archives_max are dropped. */

             while((p=strtok(q," \t\r\n,")))
               {
                if(obj_narchives < obj_archives_max)
                   op_malloc(&obj_parts[obj_archives_start + obj_narchives++],p);
                q=NULL;
               }
             break;
            }

          else if(key==att_code)     op_malloc(&obj_parts[obj_code]    ,html_yylval);
          else if(key==att_object)   op_malloc(&obj_parts[obj_object]  ,html_yylval);
          else if(key==att_codebase) op_malloc(&obj_parts[obj_codebase],html_yylval);
          else if(key==att_data)     op_malloc(&obj_parts[obj_data]    ,html_yylval);
          else if(key==att_usemap)   op_malloc(&obj_parts[obj_usemap]  ,html_yylval);
          else if(key==att_longdesc) op_malloc(&obj_parts[obj_longdesc],html_yylval);
          else if(key==att_classid)
            {
             /* The 'java:' and 'clsid:' prefixes are removed, 'data:' values are
                not kept and anything else is kept as it is. */

             if(!strncasecmp(html_yylval,"java:",5))       op_malloc(&obj_parts[obj_classid],html_yylval+5);
             else if(!strncasecmp(html_yylval,"clsid:",6)) op_malloc(&obj_parts[obj_classid],html_yylval+6);
             else if(strncasecmp(html_yylval,"data:",5))   op_malloc(&obj_parts[obj_classid],html_yylval);
            }
          else if(key==att_codetype)
            {
             if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_codetype],html_yylval+5); obj_codetype_Type = RefImage;}
             else                                     op_malloc(&obj_parts[obj_codetype],html_yylval);
            }
          else if(key==att_type)
            {
             if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_type],html_yylval+5); obj_type_Type = RefImage;}
             else                                     op_malloc(&obj_parts[obj_type],html_yylval);
            }
         }

       key=att_natts;
       break;

      default:
       /* A label must be followed by a statement. */
       break;
      }
}


#ifndef html_yywrap
/*+ Needed in lex but does nothing. +*/
#define html_yywrap() 1
#endif

/* The two string macros below work on the variables 'string', 'stringlen',
   'stringused' and 'newlen' that are in scope where they are used.  Each one
   expands to a single statement and must be followed by a semicolon. */

/*+ Reset the current string. +*/
#define reset_string \
 do \
   { \
    *string=0; \
    stringused=0; \
   } \
 while(0)

/*+ append information to the current string, if the string needs to grow and
    there is no memory to do it then it is left as it was. +*/
#define append_string(xx) \
 do \
   { \
    newlen=strlen(xx); \
    if((stringused+newlen)>=stringlen) \
      { \
       char *grown=(char*)realloc((void*)string,stringused+newlen+1); \
       if(!grown) \
          break; \
       string=grown; \
       stringlen=stringused+newlen+1; \
      } \
    strcpy(string+stringused,xx); \
    stringused+=newlen; \
   } \
 while(0)

/*+ A macro to read data that can be used by the lexer. +*/
#define YY_INPUT(buf,result,max_size) \
        if((result=read_data(html_yyfd,buf,max_size))==-1) \
           result=0;

%}

%%
 static char *string=NULL;
 static int stringlen=0,stringused=0;
 int newlen;

 /* This code runs every time that html_yylex() is called so the buffer is */
 /* kept in static variables and only allocated when there is not one, it  */
 /* is freed again when the end of the file is reached.                    */

 if(!string)
   {
    string=(char*)malloc(128);
    if(!string)
       return(0);
    stringlen=128;
    *string=0;
    stringused=0;
   }

 /* Handle comments and other tags */

[^<]+                       { /* html_yylval=html_yytext; return(LEX_PLAINTEXT); */ }
"<!DOCTYPE"                 { BEGIN(DOCTYPE); reset_string; }
"<!--"                      { BEGIN(COMMENT); reset_string; }
"<!"{W}*"-"*                { BEGIN(COMMENT_BAD); reset_string; }
"<"{W}*                     { BEGIN(TAG_START); reset_string; /* append_string(html_yytext); */ }

 /* Doctype (DTD) */

<DOCTYPE>">"                { BEGIN(INITIAL); /* html_yylval=string; return(LEX_DOCTYPE); */ }
<DOCTYPE>[^>]+              { /* append_string(html_yytext); */ }

 /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
               COMMENT is not strictly correct, but works better than the real thing. */

<COMMENT>"--"{W}*">"        { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
<COMMENT>">"                { /* append_string(html_yytext); */ }
<COMMENT>"-"                { /* append_string(html_yytext); */ }
<COMMENT>[^->]+             { /* append_string(html_yytext); */ }

<COMMENT_BAD>">"            { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
<COMMENT_BAD>[^>]+          { /* append_string(html_yytext); */ }

 /* Tags */

<TAG_START>"/"?{K}+/({W}|">") { BEGIN(TAG); html_yylval=html_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>(.|\n)           { BEGIN(INITIAL); }

<TAG>">"                    { BEGIN(INITIAL); html_yylval=""; return(LEX_TAG_END); }
<TAG>"<"                    { BEGIN(INITIAL); unput(html_yytext[0]); html_yylval=""; return(LEX_TAG_END); }
<TAG>{K}+                   { BEGIN(TAG_ATTR_KEY); html_yylval=html_yytext; return(LEX_ATTR_KEY); }
<TAG>(.|\n)                 { }

 /* An attribute name that is not followed by '=' has no value, NULL is returned for it. */

<TAG_ATTR_KEY>{W}*=         { BEGIN(TAG_ATTR_VAL); }
<TAG_ATTR_KEY>(.|\n)        { BEGIN(TAG); unput(html_yytext[0]); html_yylval=NULL; return(LEX_ATTR_VAL); }

<TAG_ATTR_VAL>\"            { BEGIN(DQUOTED); reset_string; }
<TAG_ATTR_VAL>\'            { BEGIN(SQUOTED); reset_string; }
<TAG_ATTR_VAL>{W}+          { }
<TAG_ATTR_VAL>{F}+          { BEGIN(TAG); html_yylval=html_yytext; quote=""; return(LEX_ATTR_VAL); }
<TAG_ATTR_VAL>(.|\n)        { BEGIN(TAG); unput(html_yytext[0]); html_yylval=""; quote=""; return(LEX_ATTR_VAL); }

 /* Quoted strings - line breaks inside the quotes are dropped from the value. */

<DQUOTED>\\\"               { append_string(html_yytext); }
<DQUOTED>\\                 { append_string(html_yytext); }
<DQUOTED>\"                 { BEGIN(TAG); html_yylval=string; quote="\""; return(LEX_ATTR_VAL); }
<DQUOTED>(\r|\n)+           { }
<DQUOTED>[^\\\"\r\n]+       { append_string(html_yytext); }

<SQUOTED>\\\'               { append_string(html_yytext); }
<SQUOTED>\\                 { append_string(html_yytext); }
<SQUOTED>\'                 { BEGIN(TAG); html_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
<SQUOTED>(\r|\n)+           { }
<SQUOTED>[^\\\'\r\n]+       { append_string(html_yytext); }

 /* End of file - the buffer is forgotten so that the next file gets a new one. */

<<EOF>>                     { free(string); string=NULL; stringlen=0; stringused=0; BEGIN(INITIAL); return(0); }
%%
