diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 08:02:06 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 08:02:06 -0400 |
commit | 835a38cf6d4ad8dcd75105a407ade350fbe77b8b (patch) | |
tree | 6eb0b587715594a066d9cd43f13eff9d098157f2 | |
parent | 089309b0bdd2b2f441f42c8143d9c9ab11abc4ad (diff) | |
download | beautifulsoup4-835a38cf6d4ad8dcd75105a407ade350fbe77b8b.tar.gz |
Let's get some profiling going.
-rw-r--r-- | bs4/__init__.py | 22 | ||||
-rw-r--r-- | bs4/diagnose.py | 23 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 1 |
3 files changed, 33 insertions, 13 deletions
```diff
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 0dded3a..4e268ef 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -208,7 +208,7 @@ class BeautifulSoup(Tag):
         Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
         self.hidden = 1
         self.builder.reset()
-        self.currentData = []
+        self.current_data = []
         self.currentTag = None
         self.tagStack = []
         self.pushTag(self)
@@ -244,21 +244,21 @@ class BeautifulSoup(Tag):
         self.currentTag = self.tagStack[-1]
 
     def endData(self, containerClass=NavigableString):
-        if self.currentData:
-            currentData = u''.join(self.currentData)
-            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+        if self.current_data:
+            current_data = u''.join(self.current_data)
+            if (current_data.translate(self.STRIP_ASCII_SPACES) == '' and
                 not set([tag.name for tag in self.tagStack]).intersection(
                     self.builder.preserve_whitespace_tags)):
-                if '\n' in currentData:
-                    currentData = '\n'
+                if '\n' in current_data:
+                    current_data = '\n'
                 else:
-                    currentData = ' '
-            self.currentData = []
+                    current_data = ' '
+            self.current_data = []
             if self.parse_only and len(self.tagStack) <= 1 and \
                    (not self.parse_only.text or \
-                    not self.parse_only.search(currentData)):
+                    not self.parse_only.search(current_data)):
                 return
-            o = containerClass(currentData)
+            o = containerClass(current_data)
             self.object_was_parsed(o)
 
     def object_was_parsed(self, o, parent=None, most_recent_element=None):
@@ -328,7 +328,7 @@ class BeautifulSoup(Tag):
         self._popToTag(name, nsprefix)
 
     def handle_data(self, data):
-        self.currentData.append(data)
+        self.current_data.append(data)
 
     def decode(self, pretty_print=False,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index f9bff28..b6eaa8d 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -1,10 +1,15 @@
 """Diagnostic functions, mainly for use when doing tech support."""
+import cProfile
 from StringIO import StringIO
 from HTMLParser import HTMLParser
+import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
+
 import os
+import pstats
 import random
+import tempfile
 import time
 import traceback
 import sys
@@ -174,5 +179,21 @@ def benchmark_parsers(num_elements=100000):
     b = time.time()
     print "Raw lxml parsed the markup in %.2fs." % (b-a)
 
+def profile(num_elements=100000, parser="lxml"):
+
+    filehandle = tempfile.NamedTemporaryFile()
+    filename = filehandle.name
+
+    data = rdoc(num_elements)
+    vars = dict(bs4=bs4, data=data, parser=parser)
+    cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+    stats = pstats.Stats(filename)
+    stats.strip_dirs()
+    cumulative = stats.sort_stats("cumulative")
+    total = stats.sort_stats("time")
+    import pdb; pdb.set_trace()
+
 if __name__ == '__main__':
-    diagnose(sys.stdin.read())
+    #diagnose(sys.stdin.read())
+    profile()
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 6219b89..910b37e 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -96,7 +96,6 @@ class TestWarnings(SoupTest):
             soup = self.soup("http://www.crummy.com/ is great")
         self.assertEqual(0, len(w))
 
-
 class TestSelectiveParsing(SoupTest):
 
     def test_parse_with_soupstrainer(self):
```