In-memory XML data structures and functionality

Previous: Overview of the program and main() ] [ Top: wftk core index ] [ Next: xml_read: Using expat to parse XML files into memory ]

I define three basic datastructures for dealing with XML: the element, the attribute, and the element list. Each of these has a struct definition (_element, _attr, and _list respectively) and a pointer typedef (XML, ATTR, and ELEMENTLIST). My apologies that the names don't match up, but it makes sense to me: the struct names reflect a lower-level appreciation for what the objects are, while the typedefs relect a higher-level view of what they're to be used for. Yeah. Anyway, that's my left-brain rationalization for an essentially right-brained nomenclature.
 
typedef struct _element XML;
typedef struct _attr ATTR;
typedef struct _list ELEMENTLIST;

struct _element {
   char * name;
   ATTR * attrs;
   XML * parent;
   ELEMENTLIST * children;
};

struct _attr {
   char * name;
   char * value;
   ATTR * next;
};

struct _list {
   XML * element;
   ELEMENTLIST * next;
   ELEMENTLIST * prev;
};
Those datastructures suffice to represent an XML file in memory, semi-efficiently (although hash lookup for element and attribute names will be a nice feature at some later date) and completely. To work with those, I'm building up quite a little menagerie of functions, which are listed (mostly) here and defined further down on this page. None is particularly complex. The only function not on this page is the xml_read() function, which uses expat to parse an XML file. That's on the next page because that makes it more convenient to document the handlers used.

So here's my budding XML manipulation API:
 
See xml_create: Creating an empty element
See Working with attributes: xml_set and xml_attrval
See xml_prepend: Inserting elements
See xml_append: Inserting elements
See inserting things: xml_insertbefore and xml_insertafter
See xml_createtext: a shortcut for plain text
See xml_write: Writing XML data to disk
See xml_free: Cleaning up afterwards
See Deleting pieces: xml_delete
See Children: xml_first and xml_last
See Siblings: xml_next and xml_prev
See Bookmarking things: xml_loc and xml_getloc


xml_write: Writing XML data to disk
Writing the contents of one of our XML structures out into a file is simple. We've got two different variants on this function; one writes the entire element (xml_write) and the other writes just the content of the element (xml_writecontent).
 
void xml_writecontent (FILE * file, XML * xml);
void xml_write (FILE * file, XML * xml)
{
   ATTR * attr;
   ELEMENTLIST * list;
First, if the element we're working on is plain text, we just write it out.
 
   if (xml->name == NULL) {
      fprintf (file, "%s", xml->attrs->value);
      return;
   }
It's a regular element, so we open the element and write the name.
 
   fprintf (file, "<%s", xml->name);
   attr = xml->attrs;
   while (attr != NULL) {
      fprintf (file, " %s=\"%s\"", attr->name, attr->value);
      attr = attr->next;
   }
If the element has no children (this includes text), then we close the tag as an empty tag, and we're finished.
 
   if (xml->children == NULL) {
      fprintf (file, "/>");
      return;
   } else  fprintf (file, ">");
Otherwise we track down the list of children and write each of them, recursively.
 
   xml_writecontent (file, xml);
And finally, if there were children, then we need to close the tag with the full close.
 
   fprintf (file, "</%s>", xml->name);
}
The weakness of this function currently is that in the absence of plain text there will never be a line break. Not good -- but I don't see a good algorithm for doing it better while ruling out the possibility of inserting line breaks where they'll be errors.

Let's go ahead and define our xml_writecontent.
 
void xml_writecontent (FILE * file, XML * xml)
{
   ELEMENTLIST * list;

   list = xml->children;
   while (list) {
      xml_write (file, list->element);
      list = list->next;
   }
}


xml_prepend: Inserting elements
Prepending to a linked list is, of course, very easy.
 
void xml_prepend (XML * parent, XML * child)
{
   ELEMENTLIST * list;

   child->parent = parent;

   list = (ELEMENTLIST *) malloc (sizeof(struct _list));
   list->element = child;
   list->prev = NULL;
   list->next = parent->children;
   parent->children = list;
}


xml_append: Inserting elements
It's appending where we run into problems.
 
void xml_append (XML * parent, XML * child)
{
   ELEMENTLIST * list;
   ELEMENTLIST * ch;

   child->parent = parent;

   list = (ELEMENTLIST *) malloc (sizeof(struct _list));
   list->element = child;
   list->prev = NULL;
   list->next = NULL;

   if (parent->children == NULL) {
      parent->children = list;
      return;
   }

   ch = parent->children;
   while (ch->next != NULL) ch = ch->next;
   list->prev = ch;
   ch->next = list;
}


Bookmarking things: xml_loc and xml_getloc
 
XML * xml_loc (XML * start, const char * loc)
{
   char * mark;
   const char * attrval;
   char piece[64];
   int i;
   int count;

   if (!loc) return (start);
   if (!*loc) return (start);

   while (start && start->name == NULL) start = xml_next (start);
   if (!start) return (NULL);

   while (*loc == ' ') loc++;
   i = 0;
   while (*loc && *loc != '.') piece[i++] = *loc++;
   piece[i] = '\0';
   if (*loc) loc++;
   while (*loc == ' ') loc++;

   mark = strchr (piece, ']');
   if (mark) *mark = '\0';
   mark = strchr (piece, '(');
   if (mark) {
      *mark++ = '\0';
      count = atoi (mark);
      mark = NULL;
   } else {
      count = 0;
      mark = strchr (piece, '[');
      if (mark) {
         *mark++ = '\0';
      }
   }

   while (start) {
      if (start->name == NULL) {
         start = xml_next (start);
         continue;
      }
      if (strcmp (start->name, piece)) {
         start = xml_next (start);
         continue;
      }
      if (count) {
         count --;
         start = xml_next (start);
         continue;
      }
      if (!mark) {
         if (*loc) return (xml_loc (xml_first (start), loc));
         return (start);
      }
      attrval = xml_attrval(start, "id");
      if (attrval) {
         if (strcmp (attrval, mark)) {
            start = xml_next (start);
            continue;
         }
         if (*loc) return (xml_loc (xml_first(start), loc));
         return (start);
      }
      attrval = xml_attrval(start, "name");
      if (attrval) {
         if (strcmp (attrval, mark)) {
            start = xml_next (start);
            continue;
         }
         if (*loc) return (xml_loc (xml_first(start), loc));
         return (start);
      }
   }
   return (NULL);
}
Building our locator is recursive. We build our parent's locator, append a dot, and qualify it.
 
void xml_getloc (XML * xml, char *loc, int len)
{
   int s;
   int count;
   XML * sib;
   if (xml->parent != NULL) {
      xml_getloc (xml->parent, loc, len);
   } else {
      *loc = '\0';
   }
   s = strlen (loc);
   if (s > 0 && s < len-1) { strcat (loc, "."); s++; }
   len -= s;
   loc += s;
   if (strlen(xml->name) < len) {
      strcpy (loc, xml->name);
   } else {
      strncpy (loc, xml->name, len-1);
      loc[len-1] = '\0';
   }
   if (xml->parent == NULL) return;
   sib = xml_first(xml->parent);
   count = 0;
   while (sib != xml && sib != NULL) {
      if (sib->name != NULL) {
         if (!strcmp (sib->name, xml->name)) count ++;
      }
      sib = xml_next(sib);
   }
   if (count > 0 && s > 4) {
      strcat (loc, "(");
      sprintf (loc + strlen(loc), "%d", count);
      strcat (loc, ")");
   }
}


Working with attributes: xml_set and xml_attrval
Setting an attribute is a little complicated. If the attribute is already represented in the element's attribute list, then we free the old value, allocate space for a copy of the new value, and copy it. Otherwise we allocate a new attribute holder and copy both name and value into it.
 
void xml_set (XML * xml, const char * name, const char * value)
{
   ATTR * attr;

   attr = xml->attrs;
   while (attr) {
      if (!strcmp (attr->name, name)) break;
      attr = attr->next;
   }

   if (attr) {
      free ((void *) (attr->value));
      attr->value = (char *) malloc (strlen (value) + 1);
      strcpy (attr->value, value);
      return;
   }

   if (xml->attrs == NULL) {
      attr = (ATTR *) malloc (sizeof (struct _attr));
      xml->attrs = attr;
   } else {
      attr = xml->attrs;
      while (attr->next) attr = attr->next;
      attr->next = (ATTR *) malloc (sizeof (struct _attr));
      attr = attr->next;
   }

   attr->next = NULL;
   attr->name = (char *) malloc (strlen (name) + 1);
   strcpy (attr->name, name);
   attr->value = (char *) malloc (strlen (value) + 1);
   strcpy (attr->value, value);
}
void xml_setnum (XML * xml, const char *attr, int number)
{
   char buf[sizeof(number) * 3 + 1];
   sprintf (buf, "%d", number);
   xml_set (xml, attr, buf);
}
Retriving a value, on the other hand, is rather simple.
 
const char * xml_attrval (XML * element,const char * name)
{
   ATTR * attr;

   attr = element->attrs;
   while (attr) {
      if (!strcmp (attr->name, name)) return (attr->value);
      attr = attr->next;
   }
   return ("");
}
int xml_attrvalnum (XML * element, const char * name)
{
   return (atoi (xml_attrval (element, name)));
}


xml_create: Creating an empty element
Creation of an element is nothing more than allocating the space, then allocating space for a copy of the name and copying it.
 
XML * xml_create (const char * name)
{
   XML * ret;

   ret = (XML *) malloc (sizeof (struct _element));
   ret->name = (char *) malloc (strlen (name) + 1);
   strcpy (ret->name, name);
   ret->attrs = NULL;
   ret->children = NULL;

   return (ret);
}


xml_createtext: a shortcut for plain text
I represent character data (plain old text) as an element with no name. The first (nameless) attribute contains the text. Instead of using xml_create to do this, then using xml_set to set the attribute, I'm defining a special create function for plain text chunks.

And for easy compatibility with expat, there's a version which takes a pointer and length instead of assuming null termination.
 
XML * xml_createtext (const char * value)
{
   XML * ret;

   ret = (XML *) malloc (sizeof (struct _element));
   ret->name = NULL;
   ret->children = NULL;
   ret->attrs = (ATTR *) malloc (sizeof (struct _attr));
   ret->attrs->name = NULL;
   ret->attrs->next = NULL;
   ret->attrs->value = (char *) malloc (strlen (value) + 1);
   strcpy (ret->attrs->value, value);

   return (ret);
}
XML * xml_createtextlen (const char * value, int len)
{
   XML * ret;

   ret = (XML *) malloc (sizeof (struct _element));
   ret->name = NULL;
   ret->children = NULL;
   ret->attrs = (ATTR *) malloc (sizeof (struct _attr));
   ret->attrs->name = NULL;
   ret->attrs->next = NULL;
   ret->attrs->value = (char *) malloc (len + 1);
   strncpy (ret->attrs->value, value, len);
   ret->attrs->value[len] = '\0';

   return (ret);
}


xml_free: Cleaning up afterwards
To free an XML element, we free its name, each of its attributes (and their names and values), each child (recursively) and the list element which held the child, and finally we can free the XML element itself.
 
void xml_free (XML * xml)
{
   ATTR * attr;
   ELEMENTLIST * list;

   if (xml == NULL) return;

   if (xml->name != NULL) free ((void *) (xml->name));
   while (xml->attrs) {
      attr = xml->attrs;
      xml->attrs = xml->attrs->next;
      if (attr->name != NULL) free ((void *) (attr->name));
      if (attr->value != NULL) free ((void *) (attr->value));
      xml->attrs = attr->next;
      free ((void *) attr);
   }

   while (xml->children) {
      list = xml->children;
      xml->children = list->next;
      if (list->element != NULL) xml_free (list->element);
      free ((void *) list);
   }
   free ((void *) xml);
}


Deleting pieces: xml_delete
Deleting a piece out of an XML structure is more than just freeing it; we have to close ranks before and after as well.
 
void xml_delete(XML * piece)
{
   ELEMENTLIST * list;
   if (!piece) return;
   if (piece->parent != NULL) {
      list = piece->parent->children;
      while (list != NULL && list->element != piece) list = list->next;
      if (list != NULL) {
         if (list->next != NULL) list->next->prev = list->prev;
         if (list->prev != NULL) list->prev->next = list->next;
      }
      if (list == piece->parent->children) piece->parent->children = list->next;
      free ((void *) list);
   }
   xml_free (piece);
}


Children: xml_first and xml_last
Finding the first child is, of course, very easy. The last is less so.

I've also tossed in a function xml_firstelem which is just lie xml_first except that it doesn't see plain text elements.
 
XML * xml_first(XML * xml)
{
   if (xml == NULL) return NULL;
   if (xml->children == NULL) return NULL;
   return (xml->children->element);
}
XML * xml_firstelem(XML * xml)
{
   ELEMENTLIST *list;
   if (xml == NULL) return NULL;
   list = xml->children;
   while (list != NULL) {
      if (list->element->name != NULL) break;
      list = list->next;
   }
   if (list != NULL) return (list->element);
   return NULL;
}

XML * xml_last(XML *xml)
{
   ELEMENTLIST *list;
   list = xml->children;
   if (list == NULL) return NULL;
   while (list->next != NULL) list = list->next;
   return (list->element);
}


Siblings: xml_next and xml_prev
For next and previous, we have to find the current element in its parent's children list, and then we're good to go. Each function comes in two flavors: one sees plain text and the other (e.g. xml_nextelem) doesn't.
 
XML * xml_next(XML * xml)
{
   ELEMENTLIST *list;
   if (xml == NULL) return (NULL);
   if (xml->parent == NULL) return (NULL);
   list = xml->parent->children;
   while (list != NULL && list->element != xml) list = list->next;
   if (list == NULL) return (NULL);
   if (list->next == NULL) return (NULL);
   return (list->next->element);
}
XML * xml_nextelem(XML * xml)
{
   ELEMENTLIST *list;
   if (xml == NULL) return (NULL);
   if (xml->parent == NULL) return (NULL);
   list = xml->parent->children;
   while (list != NULL && list->element != xml) list = list->next;
   if (list == NULL) return (NULL);
   while (list->next != NULL) {
      if (list->next->element->name != NULL) break;
      list = list->next;
   }
   if (list->next == NULL) return (NULL);
   return (list->next->element);
}
XML * xml_prev(XML * xml)
{
   ELEMENTLIST *list;
   if (xml == NULL) return (NULL);
   if (xml->parent == NULL) return (NULL);
   list = xml->parent->children;
   while (list != NULL && list->element != xml) list = list->next;
   if (list == NULL) return (NULL);
   if (list->prev == NULL) return (NULL);
   return (list->prev->element);
}
XML * xml_prevelem(XML * xml)
{
   ELEMENTLIST *list;
   if (xml == NULL) return (NULL);
   if (xml->parent == NULL) return (NULL);
   list = xml->parent->children;
   while (list != NULL && list->element != xml) list = list->next;
   if (list == NULL) return (NULL);
   while (list->prev != NULL) {
      if (list->prev->element->name != NULL) break;
      list = list->prev;
   }
   if (list->prev == NULL) return (NULL);
   return (list->prev->element);
}


inserting things: xml_insertbefore and xml_insertafter
I don't need this just at the moment, so I'll skip it for now.
Previous: Overview of the program and main() ] [ Top: wftk core index ] [ Next: xml_read: Using expat to parse XML files into memory ]


This code and documentation are released under the terms of the GNU license. They are additionally copyright (c) 2000, Vivtek. All rights reserved except those explicitly granted under the terms of the GNU license.