getElementsBySelector() - CSS Query Selector for HTML DOM

getElementsBySelector is a Python function which takes a standard CSS-style selector and returns an array of element objects from the document that match that selector. This is a frequently used function in JavaScript - if you use a library. It's kind of meaningless to have this function on the server side - unless you are doing screen-scraping. Then it's very useful. Recently, I had to work on a Django app that does a bit of screen-scraping - so I created this function to aid me.

This is a direct port of my JavaScript getElementsBySelector function. I'll be creating a similar function for PHP soon. Also for Ruby if time permits.

The Code


#Get DOM elements based on the given CSS Selector - V 1.00.A Beta
#Direct port of http://www.openjs.com/scripts/dom/css_selector/
def getElementsBySelector(all_selectors, document):
    """Return the elements of ``document`` matched by a CSS selector.

    The selector is a whitespace-separated chain of compound selectors; each
    step searches the descendants of the previous step's matches.  A compound
    selector is an optional tag name (or ``*``) followed by any number of
    ``#id``, ``.class`` and ``[attr]`` / ``[attr<op>value]`` parts, where
    ``<op>`` is one of ``=``, ``~=``, ``|=``, ``^=``, ``$=`` or ``*=``.
    Whitespace and quotes inside ``[...]`` are allowed.  Selector groups
    (``,``) and the ``>``, ``+`` and ``~`` combinators are not supported.

    Args:
        all_selectors: Selector string, e.g. ``'div#main a[rel=nofollow]'``.
        document: A DOM document (for example from ``xml.dom.minidom``) that
            provides ``getElementsByTagName`` and ``getElementById``; its
            elements must provide ``getAttribute`` and ``hasAttribute``.

    Returns:
        A list of the matching elements, each listed once in the order it
        was first found.  Empty when nothing matches or the selector is
        blank.

    Raises:
        ValueError: If a part of the selector cannot be parsed.
    """
    import re

    # tag (optional) followed by a run of #id / .class / [attribute] parts.
    compound_re = re.compile(r'^(\*|[\w:\-]*)((?:[#.][\w\-]+|\[[^\]]*\])*)$')
    part_re = re.compile(r'([#.])([\w\-]+)|\[([^\]]*)\]')
    # Content of one [...] part: name, optional operator and value.
    attribute_re = re.compile(
        r'^\s*([\w:\-]+)\s*'
        r'(?:([~|^$*]?=)\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\']*))\s*)?$'
    )

    def parse_compound(token):
        """Split one compound selector into (tag, ids, classes, attrs)."""
        parsed = compound_re.match(token)
        if parsed is None:
            raise ValueError("Unsupported CSS selector: %r" % (token,))
        ids, classes, attrs = [], [], []
        for marker, name, bracket in part_re.findall(parsed.group(2)):
            if marker == '#':
                ids.append(name)
            elif marker == '.':
                classes.append(name)
            else:
                attr_match = attribute_re.match(bracket)
                if attr_match is None:
                    raise ValueError(
                        "Unsupported attribute selector: %r" % (token,))
                attr, operator, dquoted, squoted, bare = attr_match.groups()
                value = next(
                    (v for v in (dquoted, squoted, bare) if v is not None), '')
                attrs.append((attr, operator, value))
        return parsed.group(1) or '*', ids, classes, attrs

    def attribute_matches(ele, attr, operator, value):
        """Test one attribute condition against an element."""
        if not ele.hasAttribute(attr):
            return False
        if operator is None:  # bare [attr]: presence is enough
            return True
        actual = ele.getAttribute(attr) or ''
        if operator == '=':
            return actual == value
        if operator == '|=':
            # Exactly the value, or the value followed by a hyphen.
            return actual == value or actual.startswith(value + '-')
        if not value:
            # The substring/word operators never match an empty value.
            return False
        if operator == '~=':
            return value in actual.split()
        if operator == '^=':
            return actual.startswith(value)
        if operator == '$=':
            return actual.endswith(value)
        return value in actual  # '*='

    def descendants(context, tag):
        """Collect the `tag` descendants of every context node, once each."""
        found = []
        seen = set()
        for node in context:
            for ele in node.getElementsByTagName(tag):
                # Nested context nodes report the same descendant repeatedly.
                if id(ele) not in seen:
                    seen.add(id(ele))
                    found.append(ele)
        return found

    # Tokenise on whitespace, but keep everything inside [...] together so
    # that spaces there do not split the selector.  Parsing everything up
    # front reports a bad selector before any DOM work is done.
    compounds = [
        parse_compound(token)
        for token in re.findall(r'(?:\[[^\]]*\]?|[^\s\[])+', all_selectors)
    ]
    if not compounds:
        return []

    context = [document]
    for tag, ids, classes, attrs in compounds:
        # getElementById only knows DTD-declared ID attributes; a plain
        # id="..." attribute is accepted as well (checked per element below).
        declared = [document.getElementById(wanted) for wanted in ids]
        matched = []
        for ele in descendants(context, tag):
            own_id = ele.getAttribute('id')
            if not all(own_id == wanted or by_dtd is ele
                       for wanted, by_dtd in zip(ids, declared)):
                continue
            if classes:
                class_names = (ele.getAttribute('class') or '').split()
                if not all(name in class_names for name in classes):
                    continue
            if not all(attribute_matches(ele, attr, operator, value)
                       for attr, operator, value in attrs):
                continue
            matched.append(ele)
        context = matched

    return context

Sample Usage


import urllib2
from xml.dom.minidom import parseString

# Fetch the page and hand the raw markup straight to the XML parser.
html = urllib2.urlopen("http://search.twitter.com/search?q=RT+http").read()
dom = parseString(html)

# Print the target of every link marked rel="nofollow".
links = getElementsBySelector("a[rel=nofollow]", dom)
for a in links:
    print(a.getAttribute("href"))

This should work. Except that it doesn't. Not my fault - the XML parser will only parse valid XML documents - and Twitter's search page is not a valid XML file. So we have to run the code through HTMLTidy. To do that, tidy must be installed. Once that is installed, use the code...


import urllib2
from xml.dom.minidom import parseString  # needed for parseString(html) below

import tidy

# Run the downloaded markup through HTMLTidy first so that the XML parser
# receives XHTML instead of the page's original HTML.
html = urllib2.urlopen("http://search.twitter.com/search?q=RT+http").read()
html = str(tidy.parseString(html, output_xhtml=1))
dom = parseString(html)

# Print the target of every link marked rel="nofollow".
links = getElementsBySelector("a[rel=nofollow]", dom)
for a in links:
    print(a.getAttribute("href"))

Other XML/HTML Parsers in Python

Beta Release

This is a beta release - so expect bugs.

License

BSD License

Comments

Henning at 15 Oct, 2009 12:49
Cool, thanks for porting this code to Python.
I tried to convert the code to use Python strings methods. While working on it, I found a small error:
The line
context = [](ele)
should probably be
context = [ele]
Reply to this.
Emil Stenström at 28 Nov, 2009 06:03
Thanks for your hard work!

Any plans to expand what it can handle? I'm parsing link tags that point to an RSS, and would need link[rel="alternate"][type] (multiple attribute selectors, which is valid CSS), but it seems the above chokes on that one. Let me know if you fix it!
Reply to this.
Emil Stenström at 28 Nov, 2009 09:21
Bah. I should do some work myself. Here's a patched version that works with multiple attribute selectors:

From line 57:
        if(string.find(element,'[')+1):#If the char '[' appears, that means it needs CSS 3 parsing
            # Code to deal with attribute selectors
            parts = element.split('[')
            tag = parts[0]
            attrs_sel = ["[%s" % attr_sel for attr_sel in parts[1:]]
            print tag, attrs_sel

            found = getElements(context,tag)

            context = []
            attribute_re = r'[(w+)([=~|^$*]?)=?['"]?([^]'"]*)['"]?]'
            for fnd in found:
                print "Potential:", fnd
                passes = 0
                for attr_sel in attrs_sel:
                    m = re.match(attribute_re, attr_sel)
                    attr = m.group(1)
                    operator = m.group(2)
                    value = m.group(3)

                    if(operator=='=' and fnd.getAttribute(attr) != value): continue
                    if(operator=='~' and not(re.search(r'(^|s)'+value+'(s|$)',  fnd.getAttribute(attr)))): continue
                    if(operator=='|' and not(re.search(r'^'+value+'-?', fnd.getAttribute(attr)))): continue
                    if(operator=='^' and string.find(fnd.getAttribute(attr), value)!=0): continue
                    if(operator=='$' and string.rfind(fnd.getAttribute(attr), value) != (fnd.getAttribute(attr).length-value.length)): continue
                    if(operator=='*' and not(string.find(fnd.getAttribute(attr), value)+1)): continue
                    if(operator=='' and not fnd.getAttribute(attr)): continue
                    print "Pass", attr_sel
                    passes += 1

                if passes == len(attrs_sel):
                    context.append(fnd)
            continue
Reply to this.
Emil Stenström at 28 Nov, 2009 09:22
Yeah, please remove the print statements :)
Reply to this.
Anonymous at 30 Jan, 2010 05:44
document.querySelector("tagname");
document.querySelector("#ID");
document.querySelector(".class");

Al ready a function!!!!
Reply to this.
Comment

Please don't enter your comments in this form - this is a fake form to confuse spamming bots. The next form is the real one.




Comment




Comment Formatting : HTML tags a, strong, em, b, i, code, pre, p and br allowed. Other tags will be shown as code(< will become &lt;). URLs and line breaks will be auto-formatted.
Subscribe to Feed