getElementsBySelector() - CSS Query Selector for HTML DOM
getElementsBySelector is a Python function which takes a standard CSS-style selector and returns an array of element objects from the document that match that selector. This is a frequently used function in JavaScript - if you use a library. It's kind of meaningless to have this function on the server side - unless you are doing screen-scraping. Then it's very useful. Recently, I had to work on a Django app that does a bit of screen-scraping - so I created this function to aid me.
This is a direct port of my JavaScript getElementsBySelector function. I'll be creating a similar function for PHP soon. Also for Ruby if time permits.
The Code
#Get DOM elements based on the given CSS Selector - V 1.00.A Beta
#Direct port of http://www.openjs.com/scripts/dom/css_selector/
def _split_selector(text, separators):
    """Split ``text`` on any character of ``separators`` outside ``[...]``.

    Characters inside an attribute selector (and inside a quoted value within
    it) never act as separators, so ``a[title="x, y"] b`` is cut only at the
    space before ``b``. Empty pieces are kept; callers discard them.
    """
    pieces = []
    current = []
    depth = 0      # nesting level of '[' ... ']'
    quote = None   # the quote character we are currently inside, if any
    for char in text:
        if quote:
            if char == quote:
                quote = None
        elif depth and char in "\"'":
            quote = char
        elif char == "[":
            depth += 1
        elif char == "]":
            depth = max(depth - 1, 0)
        elif not depth and char in separators:
            pieces.append("".join(current))
            current = []
            continue
        current.append(char)
    pieces.append("".join(current))
    return pieces


def _parse_compound(token):
    """Parse one space-free selector such as ``div#main.box[rel=x][type]``.

    Returns ``(tag, ids, classes, attribute_tests)`` where ``tag`` is ``'*'``
    when no tag was given and each attribute test is a
    ``(name, operator, value)`` tuple (``operator`` is ``''`` for a bare
    ``[name]``). Returns ``None`` when the token uses syntax that is not
    supported (combinators, pseudo-classes, malformed brackets).
    """
    import re

    simple = re.compile(
        r'#([\w-]+)'                       # 1: id
        r'|\.([\w-]+)'                     # 2: class
        r'|\[\s*([\w:-]+)\s*'              # 3: attribute name
        r'(?:([=~|^$*])=?\s*'              # 4: operator ('=' or first char of '~=' etc.)
        r'(?:"([^"]*)"|\'([^\']*)\'|([^\]"\']*?))\s*)?\]'  # 5/6/7: value
    )
    head = re.match(r'\*|[\w:-]*', token)
    tag = head.group(0) or "*"
    position = head.end()

    ids, classes, attribute_tests = [], [], []
    while position < len(token):
        match = simple.match(token, position)
        if match is None:
            return None
        if match.group(1) is not None:
            ids.append(match.group(1))
        elif match.group(2) is not None:
            classes.append(match.group(2))
        else:
            value = next(
                (part for part in match.group(5, 6, 7) if part is not None), ""
            )
            attribute_tests.append((match.group(3), match.group(4) or "", value))
        position = match.end()
    return tag, ids, classes, attribute_tests


def _attribute_matches(node, name, operator, expected):
    """Return True when ``node`` satisfies one CSS attribute test."""
    if not node.hasAttribute(name):
        return False
    if not operator:                      # [name] - presence only
        return True
    actual = node.getAttribute(name)
    if operator == "=":
        return actual == expected
    if operator == "|":                   # exactly value, or "value-..."
        return actual == expected or actual.startswith(expected + "-")
    if operator == "~":                   # one of the whitespace-separated words
        return expected in actual.split()
    if not expected:                      # ^= $= *= never match an empty value
        return False
    if operator == "^":
        return actual.startswith(expected)
    if operator == "$":
        return actual.endswith(expected)
    return expected in actual             # operator == "*"


def _descendants_by_tag(context, tag):
    """Collect descendants named ``tag`` of every node in ``context``.

    Each element appears once even when the context nodes are nested inside
    one another.
    """
    found = []
    seen = set()
    for node in context:
        for element in node.getElementsByTagName(tag):
            if id(element) not in seen:
                seen.add(id(element))
                found.append(element)
    return found


def getElementsBySelector(all_selectors, document):
    """Return the elements of ``document`` matched by a CSS selector string.

    Supported syntax: tag names and ``*``, ``#id``, ``.class`` (several may be
    chained), attribute selectors ``[a]``, ``[a=v]``, ``[a~=v]``, ``[a|=v]``,
    ``[a^=v]``, ``[a$=v]``, ``[a*=v]`` (several may be chained, values may be
    quoted), the descendant combinator (whitespace) and comma-separated
    selector groups. Anything else (``>``, ``+``, pseudo-classes, ...) matches
    nothing.

    Parameters:
        all_selectors: the selector string. An empty or None selector yields
            an empty list.
        document: a DOM node offering ``getElementsByTagName`` whose elements
            offer ``getAttribute``/``hasAttribute`` (for example an
            ``xml.dom.minidom`` document).

    Returns:
        A list of matching elements without duplicates.

    ``#id`` is matched against the ``id`` attribute and, when the document
    provides ``getElementById``, against the element that call returns, so it
    also works on parsers that only know IDs declared in a DTD.
    """
    selected = []
    if not all_selectors:
        return selected

    lookup_by_id = getattr(document, "getElementById", None)
    already_selected = set()

    for group in _split_selector(all_selectors, ","):
        tokens = [tok for tok in _split_selector(group, " \t\r\n\f") if tok]
        if not tokens:
            continue

        context = [document]
        for token in tokens:
            parsed = _parse_compound(token)
            if parsed is None:
                # Unsupported syntax must not fall through to the parent
                # context, otherwise unrelated nodes would be returned.
                context = []
                break
            tag, ids, classes, attribute_tests = parsed
            declared = [lookup_by_id(i) if lookup_by_id else None for i in ids]

            matched = []
            for node in _descendants_by_tag(context, tag):
                if any(
                    node.getAttribute("id") != wanted and node is not known
                    for wanted, known in zip(ids, declared)
                ):
                    continue
                if classes:
                    node_classes = node.getAttribute("class").split()
                    if any(name not in node_classes for name in classes):
                        continue
                if all(
                    _attribute_matches(node, name, operator, value)
                    for name, operator, value in attribute_tests
                ):
                    matched.append(node)
            context = matched
            if not context:
                break

        for node in context:
            if id(node) not in already_selected:
                already_selected.add(id(node))
                selected.append(node)
    return selected
Sample Usage
# Download a page, parse it as XML and print the href of every
# rel=nofollow link found by getElementsBySelector.
import urllib2
from xml.dom.minidom import parseString

response = urllib2.urlopen("http://search.twitter.com/search?q=RT+http")
dom = parseString(response.read())
for link in getElementsBySelector("a[rel=nofollow]", dom):
    print(link.getAttribute("href"))
This should work. Except that it doesn't. Not my fault - the XML parser will only parse valid XML documents - and Twitter's search page is not a valid XML file. So we have to run the code through HTMLTidy. To do that, tidy must be installed. Once that is installed, use the code...
# Download a page, clean it up with HTML Tidy, parse the result with minidom
# and print the href of every rel=nofollow link.
import urllib2
from xml.dom.minidom import parseString  # needed below; the snippet did not import it

import tidy

html = urllib2.urlopen("http://search.twitter.com/search?q=RT+http").read()
# Tidy is asked for XHTML output (output_xhtml=1) so that the strict XML
# parser below can read the page.
html = str(tidy.parseString(html, output_xhtml=1))
dom = parseString(html)
links = getElementsBySelector("a[rel=nofollow]", dom)
for a in links:
    print(a.getAttribute("href"))
Other XML/HTML Parsers in Python
Beta Release
This is a beta release - so expect bugs.

Comments
I tried to convert the code to use Python string methods. While working on it, I found a small error:
The line
context = [](ele)
should probably be
context = [ele]
Any plans to expand what it can handle? I'm parsing link tags that point to an RSS, and would need link[rel="alternate"][type] (multiple attribute selectors, which is valid CSS), but it seems the above chokes on that one. Let me know if you fix it!
From line 57:
if(string.find(element,'[')+1):#If the char '[' appears, that means it needs CSS 3 parsing
    # Code to deal with attribute selectors
    parts = element.split('[')
    tag = parts[0]
    attrs_sel = ["[%s" % attr_sel for attr_sel in parts[1:]]
    print tag, attrs_sel
    found = getElements(context,tag)
    context = []
    attribute_re = r'\[(\w+)([=~\|\^\$\*]?)=?[\'"]?([^\]\'"]*)[\'"]?\]'
    for fnd in found:
        print "Potential:", fnd
        passes = 0
        for attr_sel in attrs_sel:
            m = re.match(attribute_re, attr_sel)
            attr = m.group(1)
            operator = m.group(2)
            value = m.group(3)
            if(operator=='=' and fnd.getAttribute(attr) != value): continue
            if(operator=='~' and not(re.search(r'(^|\s)'+value+'(\s|$)', fnd.getAttribute(attr)))): continue
            if(operator=='|' and not(re.search(r'^'+value+'-?', fnd.getAttribute(attr)))): continue
            if(operator=='^' and string.find(fnd.getAttribute(attr), value)!=0): continue
            if(operator=='$' and string.rfind(fnd.getAttribute(attr), value) != (fnd.getAttribute(attr).length-value.length)): continue
            if(operator=='*' and not(string.find(fnd.getAttribute(attr), value)+1)): continue
            if(operator=='' and not fnd.getAttribute(attr)): continue
            print "Pass", attr_sel
            passes += 1
        if passes == len(attrs_sel): context.append(fnd)
    continue
document.querySelector("#ID");
document.querySelector(".class");
Already a function!!!!
a, strong, em, b, i, code, pre, p and br allowed. Other tags will be shown as code (< will become &lt;). URLs and line breaks will be auto-formatted.