Up: Getting Started with Shore Previous: Using Indexes

Appendix: Program Sources

stree.sdl

module stree {
	// interfaces defined here
	interface SearchTree;	// the top-level construct
	interface Word;			// a binary search tree node -- represents one word
	interface Document;		// a document stored in the repository
	interface Cite;			// a reference to a line in a document

	// A binary search tree of Word objects
	interface SearchTree {
	  private:
		attribute ref<Word> root;	// the root of the tree

		// Update the entry matching WORD to add a citation.
		// Add a new Word if necessary.
		void insert(in lref<char> word, in ref<Cite> cite);

	  public:
		// Constructor:  make an empty tree
		void initialize();

		// Insert a new Document into the repository.  The argument is a
		// pathname to be interpreted in the Unix name space as the name of a
		// Unix file containing the raw data.  A new Document object with
		// the same base name is created in the current Shore directory,
		// filled with a copy of the file's context, and indexed by all of
		// its words.
		void insert_file(in lref<char> src);

		// Retrieve the Word object matching the argument.
		// Return NULL if not found.
		ref<Word> find(in lref<char> word) const;
	};

	// There is one Word object for each distinct word appearing in any
	// document in the repository.
	interface Word {
	private:
		attribute string value;
		attribute ref<Word> left, right;
	public:
		relationship set<Cite> cited_by inverse cites;
		// Constructor: empty occurrences list
		void initialize(in lref<char> word);

		// How many occurrences?
		long count() const;

		// Get ith occurrence (returns NULL if not that many)
		ref<Cite> occurrence(in long i) const;

		// The following methods are meant to be used only by SearchTree.

		// Find decendant matching WORD creating one if necessary
		ref<Word> find_or_add(in lref<char> word);

		// Find only, return NULL on not found
		ref<Word> find(in lref<char> word) const;

		// Debugging dump
		void print(in long verbose) const;

		// Add an occurence
		void occurs_on(in ref<Cite> cite);
	};

	// A Cite object represents a citation.  There is one Cite object for each
	// line of each document in the repository.  There is thus a many-many
	// relationship from Cite to Word and a many-one relationship from Cite
	// to Document.
	interface Cite {
	private:
		attribute long offset;
	public:
		relationship ref<Document> doc inverse cited_by;
		relationship set<Word> cites inverse cited_by;
		// Constructor
		void initialize(in ref<Document> d, in long o);

		// Print the referenced line
		void print(in long vflag) const;

		// Destructor
		void finalize();
	};

	// A Document is a chunk of text that looks like a Unix file.
	// We also record the file name under which it was created.
	// (The need to record the name may go away when Shore adds a way to find
	// the pathname of a registered object given a Ref to it.)
	interface Document {
	private:
		attribute text body;
		attribute string name;
		attribute long cur_len;
	public:
		relationship set<Cite> cited_by inverse doc;
		// Constructor:  The body is empty.
		void initialize(in lref<char> base_name, in long len);

		// Add some text to the end of the body.
		void append(in lref<char> str);

		// Read-only access to the file name.
		lref<char> get_name() const;

		// Current length of text
		long size() const;

		// Print a line starting at OFFSET
		void print_line(in long offset) const;

		// Destructor
		void finalize();
	};

}

main.C

/*
 * ShoreConfig.h is needed only by applications
 * that distinguish platforms.  (Stree does not,
 * but we include this for documentation purposes.)
 */
#include <ShoreConfig.h>

#include <iostream.h>
#include <fstream.h>
#include <std.h>
#include "stree.h"

Ref<SearchTree> repository;
Ref<Pool> nodes;		// Place to create new anonymous objects

const char *DEMO_DIR = "stree";

char *argv0;
int verbose;
extern "C" int optind;

enum OPERATION {
	OP_NONE, OP_ADD, OP_LIST, OP_DEL, OP_POOL_LIST, OP_CLEAR
} operation = OP_NONE;

static void add_files(int argc, char *const*argc);
static void list_files(char *str);
static void delete_files(int argc, char *const*argc);
static void pool_list();
static void clear_all();

void usage() {
	cerr << "usage:" << endl;
	cerr << "\t" << argv0 << " -a[V] fname [fname ...]" << endl;
	cerr << "\t" << argv0 << " -l[V] word" << endl;
	cerr << "\t" << argv0 << " -d fname [fname ...]" << endl;
	cerr << "\t" << argv0 << " -p" << endl;
	cerr << "\t" << argv0 << " -c" << endl;
	cerr << "\t" << "the -V option turns on verbose mode" << endl;
	exit(1);
}

int main(int argc, char *argv[]) 
{
	argv0 = argv[0];
	shrc rc;

	// initialize connection to server
	SH_DO(Shore::init(argc, argv, 0, getenv("STREE_RC")));

	// get command-line options
	int c;
	while ((c = getopt(argc,argv,"aldpcV")) != EOF) switch(c) {
		case 'a': operation = OP_ADD; break;
		case 'l': operation = OP_LIST; break;
		case 'd': operation = OP_DEL; break;
		case 'p': operation = OP_POOL_LIST; break;
		case 'c': operation = OP_CLEAR; break;
		case 'V': verbose++; break;
		default: usage();
	}

	if (operation == OP_NONE)
		usage();

	// Start a transaction for initialization
	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal(); // this terminates the program with extreme prejudice

	// Check that our demo directory exists
	rc = Shore::chdir(DEMO_DIR);
	if (rc != RCOK) {
		if (rc != RC(SH_NotFound))
			SH_ABORT_TRANSACTION(rc);

		// Not found.  Must be the first time through.
		// Create the directory
		SH_DO(Shore::mkdir(DEMO_DIR, 0755));
		SH_DO(Shore::chdir(DEMO_DIR));

		// Make a new SearchTree object ...
		repository = new("repository", 0644) SearchTree;
		repository.update()->initialize();

		// ... and a pool for allocating Nodes.
		SH_DO(nodes.create_pool("pool", 0644, nodes));
	} else { // not first time

		// Get the repository root from the database ...
		SH_DO(Ref<SearchTree>::lookup("repository",repository));

		// ... and the pool for creating nodes
		SH_DO(nodes.lookup("pool", nodes));
	}

	SH_DO(SH_COMMIT_TRANSACTION);

	switch (operation) {
		case OP_ADD:
			add_files(argc-optind, argv+optind);
			break;
		case OP_LIST:
			if (optind != argc-1)
				usage();
			list_files(argv[optind]);
			break;
		case OP_DEL:
			delete_files(argc-optind, argv+optind);
			break;
		case OP_POOL_LIST:
			pool_list();
			break;
		case OP_CLEAR:
			clear_all();
			break;
		default: break;
	}

	return 0;
} // main

// Add all the named files to the repository
static void add_files(int argc, char *const*argv) {
	shrc rc;

	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal();
	for (int i=0; i<argc; i++)
		repository.update()->insert_file(argv[i]);
	if (verbose)
		cout << "about to commit" << endl;
	SH_DO(SH_COMMIT_TRANSACTION);
	if (verbose)
		cout << "committed" << endl;
} // add_files

// List all uses of a word
static void list_files(char *str) {
	shrc rc;
	int occurrences=0;

	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal();
	Ref<Word> w = repository->find(str);
	if (verbose)
		cout << "========== " << str << endl;
	if (w && w->count() > 0) {
		Ref<Cite> c;
		for (int i=0; c = w->occurrence(i); i++) {
			occurrences++;
			c->print(verbose);
		}
	} else if (verbose) {
		cout << "**** Not found" << endl;
		occurrences = -1;
	}
	if(occurrences >= 0 && verbose) {
		cout << "**** " << occurrences << " citation"
			<< (char *)(occurrences==1?"":"s") << endl;
	}
	SH_DO(SH_COMMIT_TRANSACTION);
} // list_files

// Removed the named files from the repository
static void delete_files(int argc, char *const*argv) {
	shrc rc;

	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal();

	for (int i=0; i<argc; i++) {
		Ref<Document> d;
		SH_DO(d.lookup(argv[i],d));
		d.update()->finalize();
		SH_DO(Shore::unlink(argv[i]));
	}
	if (verbose)
		cout << "about to commit" << endl;
	SH_DO(SH_COMMIT_TRANSACTION);
	if (verbose)
		cout << "committed" << endl;
} // delete_files

static void pool_list() {
	shrc rc;

	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal();

	Ref<any> ref;
	Ref<Word> w;
	Ref<Cite> c;
	{
		PoolScan scan("pool");
		if (scan != RCOK)
			SH_ABORT_TRANSACTION(scan.rc());

		while (scan.next(ref, true) == RCOK) {
			if (w = TYPE_OBJECT(Word).isa(ref)) {
				w->print(1);
			}
			else if (c = TYPE_OBJECT(Cite).isa(ref)) {
				c->print(2);
			}
			else cout << " Unknown type of object" << endl;
		}
	}
	SH_DO(SH_COMMIT_TRANSACTION);
} // pool_list

static void clear_all() {
	shrc rc;

	SH_BEGIN_TRANSACTION(rc);
	if (rc)
		rc.fatal();

	rc = Shore::unlink("repository");
	if (rc)
		cout << rc << endl;

	SH_DO(nodes.lookup("pool", nodes));
	SH_DO(nodes.destroy_contents());
	rc = Shore::unlink("pool");
	if (rc)
		cout << rc << endl;

	rc = Shore::chdir("..");
	if (rc)
		cout << rc << endl;

	SH_DO(Shore::rmdir(DEMO_DIR));

	SH_DO(SH_COMMIT_TRANSACTION);
} // clear_all

tree.C

// Member functions of the SearchTree class
#include <iostream.h>
#include <string.h>
#include <ctype.h>
#include "stree.h"
#include <sys/types.h>
#include <sys/stat.h>

extern Ref<Pool> nodes;
static int getword(const char *&p, char *res, int size);

extern int verbose; // defined in main.C

void SearchTree::initialize() {
	root = NULL;
}

void SearchTree::insert(char *s, Ref<Cite> c) {
	Ref<Word> w;
	if (root) {
		w = root.update()->find_or_add(s);
	}
	else {
		root = new(nodes) Word;
		root.update()->initialize(s);
		w = root;
	}
	w.update()->occurs_on(c);
}

void SearchTree::insert_file(char *fname) {
	shrc rc;

	if (verbose)
		cout << "Indexing file " << fname << endl;

	// Open input file
	ifstream in(fname);
	if (!in) {
		perror(fname);
		SH_ABORT_TRANSACTION(rc);
	}
	// do a unix stat to get the total size of the file.
	struct stat in_stat;
	if (stat(fname,&in_stat)) {
		perror(fname);
		SH_ABORT_TRANSACTION(rc);
	}

	// Create target document

	// Strip leading path from file name;
	char *base_name = strrchr(fname, '/');
	if (base_name)
		base_name++;
	else
		base_name = fname;

	Ref<Document> doc;
	rc = doc.new_persistent(base_name, 0644, doc);
	if (rc) {
		perror(base_name);
		SH_ABORT_TRANSACTION(rc);
	}
	doc.update()->initialize(base_name,in_stat.st_size);

	// for each line of the document ...
	char linebuf[1024];
	while (in.getline(linebuf, sizeof linebuf -1)) {
		long off = doc->size();

		// copy the line to the body of the document
		doc.update()->append(linebuf);
		doc.update()->append("\n");

		// allocate a new Cite object for this line
		Ref<Cite> cite = new (nodes) Cite;
		cite.update()->initialize(doc, off);

		// for each word on the line ...
		char word[100];
		const char *p = linebuf;
		while (getword(p, word, sizeof word)) {
			// link the citation to the word
			insert(word, cite);
		}
	}
}

Ref<Word> SearchTree::find(char *str) const {
	if (root)
		return root->find(str);
	return NULL;
}

// Copy a word of at most SIZE characters (including terminating null)
// in to the buffer starting at RES.  Start searching at location P.
// Words are delimited by white space.  The result is translated to lower
// case, with all non-letters eliminated.
// P is updated to point to the first character not copied.
// The result is 1 if a word is found, 0 if '\0' is encountered first.
static int getword(const char *&p, char *res, int size) {
	for (;; ) {
		// skip leading white space
		while (isspace(*p))
			p++;

		// check for eoln
		if (*p == 0)
			return 0;

		// gather non-space characters, translating to lower case and
		// ignoring non-alpha characters
		int len;
		for (len = 0; len < size-1 && *p && !isspace(*p); p++) {
			if (isupper(*p))
				res[len++] = tolower(*p);
			else if (islower(*p))
				res[len++] = *p;
		}
		if (len > 0) {
			res[len] = 0;
			return 1;
		}
		// otherwise, word was all digits and punctuation, so try again.
	}
}

word.C

// Member functions of the Word class
#include <iostream.h>
#include <string.h>
#include "stree.h"

extern Ref<Pool> nodes;

void Word::initialize(char *word) {
	value = word;
	left = NULL;
	right = NULL;
}

long Word::count() const {
	return cited_by.get_size();
}

Ref<Cite> Word::occurrence(long i) const {
	return cited_by.get_elt(i);
}

Ref<Word> Word::find_or_add(char *s) {
	int i = strcmp(s,value);
	if (i == 0)
		return this;
	if (i < 0) {
		if (left) return left.update()->find_or_add(s);
		else {
			left = new(nodes) Word;
			left.update()->initialize(s);
			return left;
		}
	}
	else {
		if (right) return right.update()->find_or_add(s);
		else {
			right = new(nodes) Word;
			right.update()->initialize(s);
			return right;
		}
	}
}

Ref<Word> Word::find(char *s) const {
	int i = strcmp(s,value);
	if (i == 0) return this;
	if (i < 0) return left ? left->find(s) : (Ref<Word>)NULL;
	return right ? right->find(s) : (Ref<Word>)NULL;
}

void Word::occurs_on(Ref<Cite> cite) {
	cited_by.add(cite);
}

void Word::print(long verbose) const {
	if (verbose) {
		int s = cited_by.get_size();
		cout << "Word '" << (char *)value
			<< "' occurs on " << s << " line" << (s==1 ? "" : "s") << endl;
	}
	else cout << (char *)value;
}

cite.C

// Member functions of the Cite class
#include <iostream.h>
#include "stree.h"

void Cite::initialize(Ref<Document> d, long o) {
	doc = d;
	offset = o;
}

void Cite::print(long v) const {
	switch (v) {
		default:
		case 0: // just the file name
			cout << doc->get_name() << endl;
			break;
		case 1: // the file name and the corresponding line
			cout << doc->get_name() << ": ";
			doc->print_line(offset);
			break;
		case 2: // debugging version
			cout << "Cite, offset " << offset
				<< " in file " << doc->get_name()
				<< " cites";
			{
				int count = cites.get_size();
				for (int i = 0; i < count; i++) {
					Ref<Word> w = cites.get_elt(i);
					cout << " ";
					w->print(0);
				}
				cout << endl;
			}
			break;
	}
}

void Cite::finalize() {
	while (cites.delete_one()) {}
}

document.C

// Member functions of the Document class
#include <iostream.h>
#include <string.h>
#include "stree.h"

void Document::initialize(char *base_name, long ilen) {
	body = 0;
	name = base_name;
	// set a char at the end of the body to initialze the
	// string space.
	body.set(ilen-1,0);
	// initialize cur_len.
	cur_len = 0;
}

void Document::append(char *str) {
	// body.set(str, body.blen(), ::strlen(str));
	int str_size = ::strlen(str);
	body.set(str, cur_len, str_size);
	cur_len += str_size;
}

char *Document::get_name() const {
	return name;
}

long Document::size() const {
	// return body.strlen();
	return cur_len;
}

void Document::print_line(long offset) const {
	char buf[100];

	body.get(buf, offset, sizeof buf);
	buf[sizeof buf - 1] = 0;
	char *p = strchr(buf, '\n');
	if (p) *++p = 0;
	cout << buf;
}

void Document::finalize() {
	Ref<Cite> p;
	while (p = cited_by.delete_one()) {
		p.update()->finalize();
		SH_DO(p.destroy());
	}
}

stree_defs.C

#define MODULE_CODE
#include "stree.h"

swc

#!/bin/sh
# --------------------------------------------------------------- #
# -- Copyright (c) 1994, 1995 Computer Sciences Department,    -- #
# -- University of Wisconsin-Madison, subject to the terms     -- #
# -- and conditions given in the file COPYRIGHT.  All Rights   -- #
# -- Reserved.                                                 -- #
# --------------------------------------------------------------- #
# $Header: /p/shore/shore_cvs/src/examples/stree/swc,v 1.3 1995/04/26 11:03:06 solomon Exp $

mountpoint=/shoremnt
program=stree

if test $1x = -ix
then
	program=doc_index
	shift
fi

if test $# -ne 1
then
	echo "usage: $0 [-i] keyword"
	exit 1
fi

prog_path=`pwd`/$program

cd $mountpoint/$program
wc `$prog_path -l $1 | sort -u`

sedit

#!/bin/sh
# --------------------------------------------------------------- #
# -- Copyright (c) 1994, 1995 Computer Sciences Department,    -- #
# -- University of Wisconsin-Madison, subject to the terms     -- #
# -- and conditions given in the file COPYRIGHT.  All Rights   -- #
# -- Reserved.                                                 -- #
# --------------------------------------------------------------- #
# $Header: /p/shore/shore_cvs/src/examples/stree/sedit,v 1.3 1995/04/26 11:03:03 solomon Exp $

mountpoint=/shoremnt
program=stree

if test $1x = -ix
then
	program=doc_index
	shift
fi

if test -t 0
then
	edit=${EDITOR:-emacs}
else
	edit="echo EDIT"
fi

if test $# -ne 1
then
	echo "usage: $0 [-i] keyword"
	exit 1
fi

prog_path=`pwd`/$program

files="`$prog_path -l $1`" || exit 1
if [ -n "$files" ]
then
	cd $mountpoint/$program
	$edit $files
else
	echo $1 not found
fi

docIndex

// Member functions of the DocIndex class
#include <iostream.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "doc_index.h"

extern Ref<Pool> nodes;
static int getword(const char *&p, char *res, int size);

extern int verbose; // defined in main.C

void DocIndex::initialize() {
	SH_DO(ind.init(UniqueBTree));
}

void DocIndex::insert(char *s, Ref<Cite> c) {
	Ref<Word> w;
	bool found;

	SH_DO(ind.find(s,w,found));
	if (!found) {
		w = new(nodes) Word;
		w.update()->initialize(s);
		SH_DO(ind.insert(s,w));
	}
	w.update()->occurs_on(c);
}

void DocIndex::insert_file(char *fname) {
	shrc rc;

	if (verbose)
		cout << "Indexing file " << fname << endl;

	// Open input file
	ifstream in(fname);
	if (!in) {
		perror(fname);
		SH_ABORT_TRANSACTION(rc);
	}
	// do a unix stat to get the total size of the file.
	struct stat in_stat;
	if (stat(fname,&in_stat)) {
		perror(fname);
		SH_ABORT_TRANSACTION(rc);
	}

	// Create target document

	// Strip leading path from file name;
	char *base_name = strrchr(fname, '/');
	if (base_name)
		base_name++;
	else
		base_name = fname;

	Ref<Document> doc;
	rc = doc.new_persistent(base_name, 0644, doc);
	if (rc) {
		perror(base_name);
		SH_ABORT_TRANSACTION(rc);
	}
	doc.update()->initialize(base_name,in_stat.st_size);

	// for each line of the document ...
	char linebuf[1024];
	while (in.getline(linebuf, sizeof linebuf -1)) {
		long off = doc->size();

		// copy the line to the body of the document
		doc.update()->append(linebuf);
		doc.update()->append("\n");

		// allocate a new Cite object for this line
		Ref<Cite> cite = new (nodes) Cite;
		cite.update()->initialize(doc, off);

		// for each word on the line ...
		char word[100];
		const char *p = linebuf;
		while (getword(p, word, sizeof word)) {
			// link the citation to the word
			insert(word, cite);
		}
	}
}

Ref<Word> DocIndex::find(char *str) const {
	Ref<Word> w;
	bool found;

	SH_DO(ind.find(str,w,found));
	if (found)
		return w;
	return NULL;
}

// Copy a word of at most SIZE characters (including terminating null)
// in to the buffer starting at RES.  Start searching at location P.
// Words are delimited by white space.  The result is translated to lower
// case, with all non-letters eliminated.
// P is updated to point to the first character not copied.
// The result is 1 if a word is found, 0 if '\0' is encountered first.
static int getword(const char *&p, char *res, int size) {
	for (;; ) {
		// skip leading white space
		while (isspace(*p))
			p++;

		// check for eoln
		if (*p == 0)
			return 0;

		// gather non-space characters, translating to lower case and
		// ignoring non-alpha characters
		int len;
		for (len = 0; len < size-1 && *p && !isspace(*p); p++) {
			if (isupper(*p))
				res[len++] = tolower(*p);
			else if (islower(*p))
				res[len++] = *p;
		}
		if (len > 0) {
			res[len] = 0;
			return 1;
		}
		// otherwise, word was all digits and punctuation, so try again.
	}
}

void DocIndex::delete_word(sdl_string w) {
	int count;
	SH_DO(ind.remove(w,count));
	if (verbose)
		cout << "deleted " << count << (count==1 ? " copy" : " copies")
			<< " of word '" << (char *)w << "' from the index" << endl;
}

void DocIndex::print() const {
	// index_iter<typeof(ind)> iterator(ind);
	IndexScanIter<sdl_string,Ref<Word> > iterator(this->ind);
	SH_DO(iterator.next());
	while (!iterator.eof) {
		cout << "key: '" << iterator.cur_key << "' value: ";
		iterator.cur_val->print(1);
		SH_DO(iterator.next());
	}
}

Marvin Solomon
Fri Aug 2 13:40:31 CDT 1996