/*
 * Copyright (c) 2008 - 2009
 *   Jonathan Schleifer <js@webkeks.org>
 *
 * All rights reserved.
 *
 * This file is part of libobjfw. It may be distributed under the terms of the
 * Q Public License 1.0, which can be found in the file LICENSE included in
 * the packaging of this file.
 */

#include "config.h"

#include <string.h>

#import "OFXMLParser.h"
#import "OFAutoreleasePool.h"
#import "OFExceptions.h"
#import "OFMacros.h"

int _OFXMLParser_reference;

/*
 * Prepares character data collected between tags for delivery to the
 * delegate: strips leading and trailing whitespace from cache IN PLACE and
 * returns the XML-unescaped result.
 *
 * cache   - The collected text; it is modified by this call.
 * handler - Object asked to resolve entities that are not built in.
 */
static OF_INLINE OFString*
transform_string(OFString *cache, OFObject <OFXMLUnescapingDelegate> *handler)
{
	/* TODO: Support for xml:space */
	[cache removeLeadingAndTrailingWhitespaces];
	return [cache stringByXMLUnescapingWithHandler: handler];
}

/*
 * Decodes a numeric character reference into a string holding that single
 * character.
 *
 * entity - Points at the '#' following the '&' (e.g. "#x20AC" or "#8364").
 *	    It does not need to be NUL-terminated.
 * length - Number of bytes of the reference, including the '#' but excluding
 *	    the terminating ';'.
 *
 * Returns nil if the reference is malformed: too short, not starting with
 * '#', containing a character that is not a digit of the selected base,
 * denoting a value above U+10FFFF, or not encodable by
 * of_string_unicode_to_utf8() (a return value of 0 is treated as failure).
 */
static OF_INLINE OFString*
parse_numeric_entity(const char *entity, size_t length)
{
	of_unichar_t c;
	size_t i;
	char buf[5];

	/*
	 * At least "#" plus one digit is required. Checking for < 2 rather
	 * than == 1 also covers length == 0, which would otherwise make the
	 * length-- below wrap around and the loops read out of bounds.
	 */
	if (length < 2 || *entity != '#')
		return nil;

	c = 0;
	entity++;
	length--;

	if (entity[0] == 'x') {
		/* "#x" without any hexadecimal digit */
		if (length == 1)
			return nil;

		entity++;
		length--;

		for (i = 0; i < length; i++) {
			if (entity[i] >= '0' && entity[i] <= '9')
				c = (c << 4) + (entity[i] - '0');
			else if (entity[i] >= 'A' && entity[i] <= 'F')
				c = (c << 4) + (entity[i] - 'A' + 10);
			else if (entity[i] >= 'a' && entity[i] <= 'f')
				c = (c << 4) + (entity[i] - 'a' + 10);
			else
				return nil;

			/*
			 * Nothing above U+10FFFF is a Unicode code point.
			 * Bailing out here keeps c from overflowing and
			 * wrapping around to a seemingly valid value.
			 */
			if (c > 0x10FFFF)
				return nil;
		}
	} else {
		for (i = 0; i < length; i++) {
			if (entity[i] >= '0' && entity[i] <= '9')
				c = (c * 10) + (entity[i] - '0');
			else
				return nil;

			/* See above: reject before c can overflow. */
			if (c > 0x10FFFF)
				return nil;
		}
	}

	if ((i = of_string_unicode_to_utf8(c, buf)) == 0)
		return nil;
	buf[i] = 0;

	return [OFString stringWithCString: buf
				    length: i];
}

@implementation OFXMLParser
/*
 * Returns a new, autoreleased parser.
 */
+ xmlParser
{
	return [[[self alloc] init] autorelease];
}

/*
 * Initializes the parser with an empty text cache and an empty stack of
 * currently open tags. If either allocation throws, the receiver is
 * deallocated and the exception is rethrown.
 */
- init
{
	self = [super init];

	@try {
		cache = [[OFMutableString alloc] init];
		previous = [[OFMutableArray alloc] init];
	} @catch (OFException *e) {
		/* We can't use [super dealloc] on OS X here. Compiler bug? */
		[self dealloc];
		@throw e;
	}

	return self;
}

/*
 * Releases the delegate and all parsing state still held.
 */
- (void)dealloc
{
	[delegate release];

	[cache release];
	[name release];
	[prefix release];
	[ns release];
	[attrs release];
	[attr_name release];
	[attr_prefix release];
	[previous release];

	[super dealloc];
}

/*
 * Returns the current delegate, retained and autoreleased.
 */
- (id)delegate
{
	return [[delegate retain] autorelease];
}

/*
 * Sets the delegate that receives the parser's callbacks. The delegate is
 * retained. Returns self.
 */
- setDelegate: (OFObject <OFXMLParserDelegate>*)delegate_
{
	/* Retain first so that passing the current delegate again is safe. */
	[delegate_ retain];
	[delegate release];
	delegate = delegate_;

	return self;
}

/*
 * Feeds size bytes from buf into the parser's state machine and reports
 * tags, text and comments to the delegate as they are completed.
 *
 * The parsing state is kept in instance variables, so a document may be
 * passed in several consecutive calls: bytes of a token that is still
 * incomplete when the buffer ends are saved in cache and combined with the
 * next buffer.
 *
 * Throws an OFMalformedXMLException if a closing tag does not match the most
 * recently opened tag, if an attribute value is not quoted, if something
 * other than '>' follows where a tag has to end, or if "<!" is not followed
 * by "--".
 *
 * NOTE(review): only ' ' is treated as whitespace inside tags; tabs and
 * newlines are not recognized as separators.
 *
 * Returns self.
 */
- parseBuffer: (const char*)buf
     withSize: (size_t)size
{
	OFAutoreleasePool *pool;
	size_t i, last, len;

	/* last is the index of the first byte not yet consumed. */
	last = 0;

	for (i = 0; i < size; i++) {
		switch (state) {
		/* Not in a tag */
		case OF_XMLPARSER_OUTSIDE_TAG:
			if (buf[i] == '<') {
				len = i - last;

				if (len > 0)
					[cache appendCString: buf + last
						  withLength: len];

				if ([cache cStringLength] > 0) {
					OFString *str;

					pool = [[OFAutoreleasePool alloc] init];
					str = transform_string(cache, self);
					[delegate xmlParser: self
						foundString: str];
					[pool release];
				}

				[cache setToCString: ""];

				last = i + 1;
				state = OF_XMLPARSER_TAG_OPENED;
			}
			break;

		/* Tag was just opened */
		case OF_XMLPARSER_TAG_OPENED:
			if (buf[i] == '/') {
				last = i + 1;
				state = OF_XMLPARSER_IN_CLOSE_TAG_NAME;
			} else if (buf[i] == '!') {
				last = i + 1;
				state = OF_XMLPARSER_IN_COMMENT_1;
			} else {
				state = OF_XMLPARSER_IN_TAG_NAME;
				/*
				 * Look at this byte again in the new state.
				 * If i is 0, the unsigned decrement wraps and
				 * the loop's i++ brings it back to 0.
				 */
				i--;
			}
			break;

		/* Inside a tag, no name yet */
		case OF_XMLPARSER_IN_TAG_NAME:
			if (buf[i] == ' ' || buf[i] == '>' || buf[i] == '/') {
				const char *cache_c, *tmp;
				size_t cache_len;

				len = i - last;

				if (len > 0)
					[cache appendCString: buf + last
						  withLength: len];

				cache_c = [cache cString];
				cache_len = [cache cStringLength];

				/* Split "prefix:name" at the first colon. */
				if ((tmp = memchr(cache_c, ':',
				    cache_len)) != NULL) {
					name = [[OFString alloc]
					    initWithCString: tmp + 1
						     length: cache_len -
							     (tmp - cache_c) -
							     1];
					prefix = [[OFString alloc]
					    initWithCString: cache_c
						     length: tmp - cache_c];
				} else {
					name = [cache copy];
					prefix = nil;
				}

				if (buf[i] == '>' || buf[i] == '/') {
					/* Tag without any attributes */
					pool = [[OFAutoreleasePool alloc] init];

					[delegate xmlParser: self
					didStartTagWithName: name
						     prefix: prefix
						  namespace: ns
						 attributes: nil];

					if (buf[i] == '/')
						[delegate xmlParser: self
						  didEndTagWithName: name
							     prefix: prefix
							  namespace: ns];
					else
						/*
						 * Remember the full tag name
						 * to match the closing tag.
						 */
						[previous addObject:
						    [[cache copy] autorelease]];

					[pool release];

					[name release];
					[prefix release];
					[ns release];
					name = prefix = ns = nil;

					state = (buf[i] == '/'
					    ? OF_XMLPARSER_EXPECT_CLOSE
					    : OF_XMLPARSER_OUTSIDE_TAG);
				} else
					state = OF_XMLPARSER_IN_TAG;

				[cache setToCString: ""];
				last = i + 1;
			}
			break;

		/* Inside a close tag, no name yet */
		case OF_XMLPARSER_IN_CLOSE_TAG_NAME:
			if (buf[i] == ' ' || buf[i] == '>') {
				const char *cache_c, *tmp;
				size_t cache_len;

				len = i - last;

				if (len > 0)
					[cache appendCString: buf + last
						  withLength: len];

				/*
				 * The closing tag has to match the tag opened
				 * last. This is checked before name and prefix
				 * are allocated so that nothing is left behind
				 * in the instance variables when throwing.
				 */
				if (![[previous lastObject] isEqual: cache])
					@throw [OFMalformedXMLException
					    newWithClass: isa];

				cache_c = [cache cString];
				cache_len = [cache cStringLength];

				/* Split "prefix:name" at the first colon. */
				if ((tmp = memchr(cache_c, ':',
				    cache_len)) != NULL) {
					name = [[OFString alloc]
					    initWithCString: tmp + 1
						     length: cache_len -
							     (tmp - cache_c) -
							     1];
					prefix = [[OFString alloc]
					    initWithCString: cache_c
						     length: tmp - cache_c];
				} else {
					name = [cache copy];
					prefix = nil;
				}

				[previous removeNObjects: 1];
				[cache setToCString: ""];

				pool = [[OFAutoreleasePool alloc] init];

				[delegate xmlParser: self
				  didEndTagWithName: name
					     prefix: prefix
					  namespace: ns];

				[pool release];

				[name release];
				[prefix release];
				[ns release];
				name = prefix = ns = nil;

				last = i + 1;
				state = (buf[i] == ' '
				    ? OF_XMLPARSER_EXPECT_SPACE_OR_CLOSE
				    : OF_XMLPARSER_OUTSIDE_TAG);
			}
			break;

		/* Inside a tag, name found */
		case OF_XMLPARSER_IN_TAG:
			if (buf[i] == '>' || buf[i] == '/') {
				pool = [[OFAutoreleasePool alloc] init];

				[delegate xmlParser: self
				didStartTagWithName: name
					     prefix: prefix
					  namespace: ns
					 attributes: attrs];

				if (buf[i] == '/')
					[delegate xmlParser: self
					  didEndTagWithName: name
						     prefix: prefix
						  namespace: ns];
				else if (prefix != nil) {
					/*
					 * Rebuild the full tag name to match
					 * the closing tag against.
					 */
					OFString *str = [OFString
					    stringWithFormat: @"%s:%s",
							      [prefix cString],
							      [name cString]];
					[previous addObject: str];
				} else
					[previous addObject: name];

				[pool release];

				[name release];
				[prefix release];
				[ns release];
				[attrs release];
				name = prefix = ns = nil;
				attrs = nil;

				last = i + 1;
				state = (buf[i] == '/'
				    ? OF_XMLPARSER_EXPECT_CLOSE
				    : OF_XMLPARSER_OUTSIDE_TAG);
			} else if (buf[i] != ' ') {
				last = i;
				state = OF_XMLPARSER_IN_ATTR_NAME;
				/* Look at this byte again in the new state. */
				i--;
			}
			break;

		/* Looking for attribute name */
		case OF_XMLPARSER_IN_ATTR_NAME:
			if (buf[i] == '=') {
				const char *cache_c, *tmp;
				size_t cache_len;

				len = i - last;

				if (len > 0)
					[cache appendCString: buf + last
						  withLength: len];

				cache_c = [cache cString];
				cache_len = [cache cStringLength];

				/* Split "prefix:name" at the first colon. */
				if ((tmp = memchr(cache_c, ':',
				    cache_len)) != NULL) {
					attr_name = [[OFString alloc]
					    initWithCString: tmp + 1
						     length: cache_len -
							     (tmp - cache_c) -
							     1];
					attr_prefix = [[OFString alloc]
					    initWithCString: cache_c
						     length: tmp - cache_c];
				} else {
					attr_name = [cache copy];
					attr_prefix = nil;
				}

				[cache setToCString: ""];

				last = i + 1;
				state = OF_XMLPARSER_EXPECT_DELIM;
			}
			break;

		/* Expecting delimiter */
		case OF_XMLPARSER_EXPECT_DELIM:
			if (buf[i] != '\'' && buf[i] != '"')
				@throw [OFMalformedXMLException
				    newWithClass: isa];

			/* The value ends at the same kind of quote. */
			delim = buf[i];
			last = i + 1;
			state = OF_XMLPARSER_IN_ATTR_VALUE;
			break;

		/* Looking for attribute value */
		case OF_XMLPARSER_IN_ATTR_VALUE:
			if (buf[i] == delim) {
				OFString *attr_val;

				len = i - last;

				if (len > 0)
					[cache appendCString: buf + last
						  withLength: len];

				/* The array is created with the first attribute. */
				if (attrs == nil)
					attrs = [[OFMutableArray alloc] init];

				pool = [[OFAutoreleasePool alloc] init];

				attr_val = [cache
				    stringByXMLUnescapingWithHandler: self];
				[attrs addObject: [OFXMLAttribute
				    attributeWithName: attr_name
					       prefix: attr_prefix
					    namespace: nil
					  stringValue: attr_val]];

				[pool release];

				[cache setToCString: ""];
				[attr_name release];
				[attr_prefix release];
				attr_name = attr_prefix = nil;

				last = i + 1;
				state = OF_XMLPARSER_IN_TAG;
			}
			break;

		/* Expecting closing '>' */
		case OF_XMLPARSER_EXPECT_CLOSE:
			if (buf[i] == '>') {
				last = i + 1;
				state = OF_XMLPARSER_OUTSIDE_TAG;
			} else
				@throw [OFMalformedXMLException
				    newWithClass: isa];
			break;

		/* Expecting closing '>' or space */
		case OF_XMLPARSER_EXPECT_SPACE_OR_CLOSE:
			if (buf[i] == '>') {
				last = i + 1;
				state = OF_XMLPARSER_OUTSIDE_TAG;
			} else if (buf[i] != ' ')
				@throw [OFMalformedXMLException
				    newWithClass: isa];
			break;

		/* Comment */
		case OF_XMLPARSER_IN_COMMENT_1:
		case OF_XMLPARSER_IN_COMMENT_2:
			/* The two '-' that have to follow "<!" */
			if (buf[i] != '-')
				@throw [OFMalformedXMLException
				    newWithClass: isa];

			last = i + 1;
			/* Advances to the next OF_XMLPARSER_IN_COMMENT_* state. */
			state++;
			break;
		case OF_XMLPARSER_IN_COMMENT_3:
			/* Inside the comment, waiting for the first '-' */
			if (buf[i] == '-')
				state = OF_XMLPARSER_IN_COMMENT_4;
			break;
		case OF_XMLPARSER_IN_COMMENT_4:
			if (buf[i] == '-') {
				size_t cache_len;

				[cache appendCString: buf + last
					  withLength: i - last];
				cache_len = [cache length];

				pool = [[OFAutoreleasePool alloc] init];

				/*
				 * The cache now ends with the first '-' of
				 * the terminating "--"; drop it.
				 */
				[cache removeCharactersFromIndex: cache_len - 1
							 toIndex: cache_len];
				[cache removeLeadingAndTrailingWhitespaces];
				[delegate xmlParser: self
				       foundComment: cache];

				[pool release];

				[cache setToCString: ""];

				last = i + 1;
				state = OF_XMLPARSER_EXPECT_CLOSE;
			} else
				/* A single '-' is part of the comment. */
				state = OF_XMLPARSER_IN_COMMENT_3;
			break;
		}
	}

	/* Save what has not been consumed yet for the next call. */
	len = size - last;
	/* In OF_XMLPARSER_IN_TAG, there can be only spaces */
	if (len > 0 && state != OF_XMLPARSER_IN_TAG)
		[cache appendCString: buf + last
			  withLength: len];

	return self;
}

/*
 * Called while unescaping text or attribute values when an entity is not
 * built in; the question is forwarded to the delegate and its answer is
 * returned.
 */
- (OFString*)foundUnknownEntityNamed: (OFString*)entity
{
	return [delegate xmlParser: self
	   foundUnknownEntityNamed: entity];
}
@end

@implementation OFString (OFXMLUnescaping)
/*
 * Returns the receiver with XML entities replaced, without a handler for
 * unknown entities.
 */
- stringByXMLUnescaping
{
	return [self stringByXMLUnescapingWithHandler: nil];
}

/*
 * Returns a new string in which the entities of the receiver are replaced:
 * "&lt;", "&gt;", "&quot;", "&apos;" and "&amp;" by the character they stand
 * for, references starting with "&#" by the character with that number and
 * every other entity by the string h returns for it.
 *
 * h - Object asked for the replacement of any other entity; may be nil.
 *
 * Throws an OFInvalidEncodingException if a numeric reference is malformed,
 * if another entity is encountered while h is nil or returns nil for it, or
 * if the receiver ends in the middle of an entity.
 */
- stringByXMLUnescapingWithHandler: (OFObject <OFXMLUnescapingDelegate>*)h
{
	size_t i, last;
	BOOL in_entity;
	OFString *ret;

	/* last is the index of the first byte not yet copied to ret. */
	last = 0;
	in_entity = NO;
	ret = [OFMutableString string];
	ret->is_utf8 = is_utf8;

	for (i = 0; i < length; i++) {
		if (!in_entity && string[i] == '&') {
			/* Copy the plain text in front of the entity. */
			[ret appendCStringWithoutUTF8Checking: string + last
						       length: i - last];

			last = i + 1;
			in_entity = YES;
		} else if (in_entity && string[i] == ';') {
			/* The entity without the enclosing '&' and ';' */
			char *entity = string + last;
			size_t len = i - last;

			if (len == 2 && !memcmp(entity, "lt", 2))
				[ret appendCStringWithoutUTF8Checking: "<"
							       length: 1];
			else if (len == 2 && !memcmp(entity, "gt", 2))
				[ret appendCStringWithoutUTF8Checking: ">"
							       length: 1];
			else if (len == 4 && !memcmp(entity, "quot", 4))
				[ret appendCStringWithoutUTF8Checking: "\""
							       length: 1];
			else if (len == 4 && !memcmp(entity, "apos", 4))
				[ret appendCStringWithoutUTF8Checking: "'"
							       length: 1];
			else if (len == 3 && !memcmp(entity, "amp", 3))
				[ret appendCStringWithoutUTF8Checking: "&"
							       length: 1];
			else if (entity[0] == '#') {
				OFAutoreleasePool *pool;
				OFString *tmp;

				pool = [[OFAutoreleasePool alloc] init];
				tmp = parse_numeric_entity(entity, len);

				if (tmp == nil) {
					/* Don't leave the pool behind. */
					[pool release];
					@throw [OFInvalidEncodingException
					    newWithClass: isa];
				}

				[ret appendString: tmp];
				[pool release];
			} else if (h != nil) {
				OFAutoreleasePool *pool;
				OFString *n, *tmp;

				pool = [[OFAutoreleasePool alloc] init];

				n = [OFString stringWithCString: entity
							 length: len];
				tmp = [h foundUnknownEntityNamed: n];

				if (tmp == nil) {
					/* Don't leave the pool behind. */
					[pool release];
					@throw [OFInvalidEncodingException
					    newWithClass: isa];
				}

				[ret appendString: tmp];
				[pool release];
			} else
				@throw [OFInvalidEncodingException
				    newWithClass: isa];

			last = i + 1;
			in_entity = NO;
		}
	}

	/* An '&' without a terminating ';' */
	if (in_entity)
		@throw [OFInvalidEncodingException newWithClass: isa];

	/* Copy the plain text after the last entity. */
	[ret appendCStringWithoutUTF8Checking: string + last
				       length: i - last];

	return ret;
}
@end

/*
 * Default implementations of the delegate callbacks: all of them do nothing,
 * and no replacement (nil) is returned for unknown entities.
 */
@implementation OFObject (OFXMLParserDelegate)
-    (void)xmlParser: (OFXMLParser*)parser
 didStartTagWithName: (OFString*)name
	      prefix: (OFString*)prefix
	   namespace: (OFString*)ns
	  attributes: (OFArray*)attrs
{
}

-  (void)xmlParser: (OFXMLParser*)parser
 didEndTagWithName: (OFString*)name
	    prefix: (OFString*)prefix
	 namespace: (OFString*)ns
{
}

- (void)xmlParser: (OFXMLParser*)parser
      foundString: (OFString*)string
{
}

- (void)xmlParser: (OFXMLParser*)parser
     foundComment: (OFString*)comment
{
}

- (OFString*)xmlParser: (OFXMLParser*)parser
foundUnknownEntityNamed: (OFString*)entity
{
	return nil;
}
@end