Страницы

Translate

среда, 23 октября 2013 г.

Exercise 6.3. Write a cross-referencer that prints a list of all words in a document, and for each word, a list of the line numbers on which it occurs.



Exercise 6.3. Write a cross-referencer that prints a list of all words in a document, and for each word, a list of the line numbers on which it occurs. Remove noise words like "the," "and," and so on.


#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

#define MAXWORD 100                 //size of the word buffer that main passes to getword

struct linklist                     //a singly linked list of line numbers
{
    int lnum;                       //line number on which the word occurs
    struct  linklist *ptr;          //next node of the list, NULL in the last node
};

struct tnode                      //a node of the binary search tree of words
 { 
    char *word;                    //points to the text of the word
    struct linklist *lines; //list of the line numbers on which the word occurs
    struct tnode *left;   //left child: words that compare less than this one
    struct tnode *right;  //right child: words that compare greater than this one
};

// Forward declarations of the functions called from main; the definitions follow below.
struct tnode *addtree(struct tnode *, char *, int);
void treeprint(struct tnode *);
int getword(char *, int);
int commonwords(char *);



/*
 * main: print a cross-reference of the words read by getword.
 *
 * Every token that starts with a letter and is not a noise word
 * (commonwords returns -1 for it) is stored in a binary tree together
 * with the number of the line it was seen on; a "\n" token advances the
 * line counter.  When getword reports EOF the tree is printed with
 * treeprint.  argc and argv are not used.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    struct tnode *root = NULL;
    char word[MAXWORD];
    int linenum = 1;

    (void)argc;
    (void)argv;

    while (getword(word, MAXWORD) != EOF) {
        /* The <ctype.h> functions require an unsigned char value (or EOF):
           a plain char holding a byte >= 0x80 may be negative, and passing
           a negative value is undefined behaviour. */
        if (isalpha((unsigned char)word[0]) && commonwords(word) == -1) {
            root = addtree(root, word, linenum);
        } else if (word[0] == '\n') {
            linenum++;
        }
    }
    treeprint(root);
    return 0;
}

// Forward declarations of the helpers used by addtree; the definitions follow below.
struct tnode *talloc(void);
struct linklist *lalloc(void);
char *s_dup(char *s);
void addline(struct tnode *, int);

/* addtree: add a node with w, at or below p */ 
struct tnode *addtree(struct tnode *p, char *w, int linenum)
{
    int cond;
    
    if(p == NULL) //a new word has arrived
    {
        p = talloc(); //make a new node
        p->word = s_dup(w);
        p->lines = lalloc();
        p->lines->lnum = linenum;
        p->lines->ptr = NULL;
        p->left = p->right = NULL;
    }
    else if((cond = strcmp(w, p->word)) == 0)
        addline(p, linenum);
    else if(cond < 0)
        p->left = addtree(p->left, w, linenum);
    else
        p->right = addtree(p->right, w, linenum);
    return p;
}

/*
 * addline: record that the word in node p occurs on line linenum.
 *
 * The node's list is walked to its end; if linenum is met on the way
 * nothing is added, so a line number appears at most once per word.
 * Otherwise a new element is appended at the tail (or becomes the head
 * if the list is empty).
 *
 * If the new element cannot be allocated, a message is written to stderr
 * and the list is left unchanged.
 */
void addline(struct tnode *p, int linenum)
{
    struct linklist *tail = p->lines;
    struct linklist *node;

    if (tail != NULL) {
        while (tail->ptr != NULL && tail->lnum != linenum)
            tail = tail->ptr;
        if (tail->lnum == linenum)
            return;                         /* already recorded */
    }

    node = lalloc();
    if (node == NULL) {
        fprintf(stderr, "addline: out of memory, line %d not recorded\n",
                linenum);
        return;
    }
    node->lnum = linenum;
    node->ptr = NULL;

    if (tail == NULL)
        p->lines = node;
    else
        tail->ptr = node;
}

/* treeprint: in-order traversal of the tree rooted at p; for every node
   one line is printed: the word, a colon, and its line numbers */
void treeprint(struct tnode *p)
{
    struct linklist *entry;

    if (p == NULL)
        return;

    treeprint(p->left);                 /* smaller words first */

    printf("%10s:", p->word);
    entry = p->lines;
    while (entry != NULL) {
        printf("%4d", entry->lnum);
        entry = entry->ptr;
    }
    printf("\n");

    treeprint(p->right);                /* then the larger ones */
}

/* lalloc: allocate one uninitialised list element with malloc;
   returns NULL if malloc fails */
struct linklist *lalloc(void)
{
    struct linklist *node = malloc(sizeof *node);

    return node;
}

/* talloc: allocate one uninitialised tree node with malloc;
   returns NULL if malloc fails */
struct tnode *talloc(void)
{
    struct tnode *node = malloc(sizeof *node);

    return node;
}

/* s_dup: return a copy of the string s in memory obtained from malloc,
   or NULL if the allocation fails; the caller owns the copy */
char *s_dup(char *s)
{
    size_t size = strlen(s) + 1;        /* the characters plus the '\0' */
    char *copy = malloc(size);

    if (copy == NULL)
        return NULL;
    memcpy(copy, s, size);
    return copy;
}

/* cw_casecmp: compare a and b without regard to letter case; the sign of
   the result has the same meaning as for strcmp */
static int cw_casecmp(const char *a, const char *b)
{
    for (;; a++, b++) {
        /* cast first: tolower needs an unsigned char value */
        int ca = tolower((unsigned char)*a);
        int cb = tolower((unsigned char)*b);

        if (ca != cb)
            return ca - cb;
        if (ca == '\0')
            return 0;
    }
}

/*
 * commonwords: tell whether w is a noise word.
 *
 * The word is looked up by binary search in a sorted table of lower-case
 * noise words.  The comparison ignores letter case, so "The" and "AND"
 * are recognised as well.
 *
 * Returns the index of the word in the table (>= 0) if w is a noise
 * word, -1 otherwise.  Callers should only test the result against -1.
 */
int commonwords(char *w)
{
    /* must stay sorted: the binary search below depends on it */
    static const char *const cw[] = {
        "a", "an", "and", "are", "in", "is", "of",
        "or", "that", "the", "this", "to"
    };
    int low = 0;
    int high = (int)(sizeof cw / sizeof cw[0]) - 1;

    while (low <= high) {
        int mid = low + (high - low) / 2;
        int cond = cw_casecmp(w, cw[mid]);

        if (cond < 0)
            high = mid - 1;
        else if (cond > 0)
            low = mid + 1;
        else
            return mid;
    }
    return -1;
}

#define BUFSIZE 100         //capacity of the push-back buffer

/* Push-back stack shared by getch and ungetch.  The slots are int rather
   than char so that every value getch can return -- EOF and bytes >= 0x80
   included -- comes back unchanged after it was pushed with ungetch. */
int buf[BUFSIZE];           //buffer for ungetch
int bufp = 0;               //next free position in buf

/* getch: get a (possibly pushed-back) character: the most recently
   pushed-back one if buf is not empty, otherwise the next one from getchar */
int getch(void)
{
    if (bufp > 0) {
        bufp--;
        return buf[bufp];
    }
    return getchar();
}

/* ungetch: push character c back on input so that the next getch returns it.
   If buf is already full, c is discarded and a diagnostic is written to
   stderr, which keeps it out of the listing printed on stdout. */
void ungetch(int c)
{
    if (bufp >= BUFSIZE) {
        fprintf(stderr, "ungetch: too many characters\n");
        return;
    }
    buf[bufp++] = c;
}

/*
 * getword: get next word or character from input.
 *
 * Skips white space other than '\n' (a newline is returned as a token of
 * its own) and then stores one token in word as a '\0'-terminated string:
 *   - a word: a letter, '_' or '#' followed by letters, digits and '_';
 *   - a character or string literal, with its quotes and backslash escapes;
 *   - a C comment, which is skipped by in_comment; word is then "/";
 *   - any other single character.
 * At most lim-1 characters are stored; whatever does not fit is left in
 * the input for the next call.
 *
 * Note: newlines inside a comment or a literal are consumed here and are
 * not returned as separate tokens.
 *
 * Returns the first character of the token, EOF at end of input, or,
 * after a comment, the value returned by in_comment.
 */
int getword(char *word, int lim)
{
    int c, d;
    int getch(void);
    void ungetch(int);
    int in_comment(void);
    int n = 0;              /* characters stored in word so far */
    int max = lim - 1;      /* most characters that fit in front of the '\0' */

    /* Characters are kept in int variables and cast to unsigned char for
       the <ctype.h> calls, which are undefined for negative char values. */
    while ((c = getch()) != EOF && c != '\n' && isspace((unsigned char)c))
        ;

    if (c == EOF) {
        if (lim > 0)
            word[0] = '\0';
        return EOF;
    }

    if (n < max)
        word[n++] = (char)c;

    if (isalpha((unsigned char)c) || c == '_' || c == '#') {
        /* word: collect letters, digits and underscores */
        while (n < max) {
            d = getch();
            if (d == EOF || (!isalnum((unsigned char)d) && d != '_')) {
                ungetch(d);         /* belongs to the next token */
                break;
            }
            word[n++] = (char)d;
        }
    } else if (c == '\'' || c == '"') {
        /* literal: collect up to and including the matching quote */
        while (n < max) {
            d = getch();
            if (d == EOF)
                break;
            word[n++] = (char)d;
            if (d == c)
                break;
            if (d == '\\' && n < max) {
                /* the escaped character never ends the literal */
                d = getch();
                if (d == EOF)
                    break;
                word[n++] = (char)d;
            }
        }
    } else if (c == '/') {
        d = getch();
        if (d == '*')
            c = in_comment();
        else
            ungetch(d);
    }

    if (lim > 0)
        word[n] = '\0';
    return c;
}

/* in_comment: consume input up to and including the closing star-slash
   of a comment whose opening has already been read.  Everything in
   between, newlines included, is discarded.
   Returns '/' when the comment was closed, EOF if the input ended first. */
int in_comment(void)
{
    int ch;

    for (;;) {
        ch = getch();
        if (ch == EOF)
            return ch;
        if (ch != '*')
            continue;

        /* a star: the comment ends only if a slash follows directly */
        ch = getch();
        if (ch == '/')
            return ch;
        ungetch(ch);        /* may itself be a star, so look at it again */
    }
}

Комментариев нет:

Отправить комментарий