While writing Daily Deviations, I searched for a parser that could read HTML markup and extract information about deviations. None was suitable, so I wrote my own and am now sharing it with you.
Here is a sample project with all sources: Images Extractor
Usage
The HyperParser API is modeled after the NSXMLParser API. Typically, the first step is to create a delegate object that collects the required data from the stream of HTML tokens. In our example, we extract the src attribute of img elements:
/**
 * HyperParser delegate that gathers the value of the `src` attribute of
 * every `img` element it is told about.
 */
@interface ImagesCollector : NSObject <HyperParserDelegate> {
    // Collected `src` values, appended in the order the callbacks arrive.
    NSMutableArray *urls;
}

/**
 * Returns the image links collected so far, in the order they were added.
 *
 * The returned array is an immutable, autoreleased snapshot: it does not
 * change if more elements are collected afterwards, and callers cannot
 * reach the collector's internal storage through it.
 */
- (NSArray *)urls;

@end

@implementation ImagesCollector

- (id)init {
    self = [super init];
    if (self) {
        urls = [[NSMutableArray alloc] init];
    }
    return self;
}

- (void)dealloc {
    [urls release];
    [super dealloc];
}

/**
 * Delegate callback for an element start. Records the `src` attribute when
 * the element is an `img`; every other element is ignored.
 *
 * @param parser        The parser sending the callback (unused).
 * @param elementName   Name of the element; compared case-insensitively.
 * @param attributeDict Attributes of the element, keyed by attribute name.
 */
- (void)parser:(HyperParser *)parser
didStartElement:(NSString *)elementName
    attributes:(NSDictionary *)attributeDict
{
    if ([elementName caseInsensitiveCompare:@"img"] != NSOrderedSame) {
        return;
    }

    // NOTE(review): this key lookup is case-sensitive, so an attribute
    // reported as "SRC" would be missed -- confirm how HyperParser reports
    // attribute names.
    NSString *imageLink = [attributeDict objectForKey:@"src"];
    if (imageLink) {
        // An img without a src attribute yields nil, which must not be added.
        [urls addObject:imageLink];
    }
}

- (NSArray *)urls {
    // Hand out a copy rather than the mutable ivar so callers cannot mutate
    // (or observe later mutations of) the collector's private state.
    return [[urls copy] autorelease];
}

@end
The second step is to actually parse a string:
// Load the HTML document to scan. The non-deprecated loader detects the
// file's encoding and reports why a read failed instead of silently
// returning nil.
NSError *readError = nil;
NSStringEncoding detectedEncoding;
NSString *page = [NSString stringWithContentsOfFile:@"page.html"
                                       usedEncoding:&detectedEncoding
                                              error:&readError];
if (page == nil) {
    NSLog(@"Could not read page.html: %@", readError);
}

// -length returns NSUInteger; cast so the argument matches the format
// specifier on both 32-bit and 64-bit builds.
NSLog(@"Page length: %lu", (unsigned long)[page length]);

// Feed the page through the parser with the collector attached as delegate.
ImagesCollector *collector = [[[ImagesCollector alloc] init] autorelease];
HyperParser *parser = [[[HyperParser alloc] initWithString:page] autorelease];
parser.delegate = collector;
[parser parse];

// Print each collected link. The URL is passed as an argument, never as the
// format string: URLs routinely contain '%' escapes (e.g. "%20") that NSLog
// would otherwise interpret as format specifiers.
for (NSString *url in [collector urls]) {
    NSLog(@"%@", url);
}






