summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 08:02:06 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 08:02:06 -0400
commit 835a38cf6d4ad8dcd75105a407ade350fbe77b8b (patch)
tree 6eb0b587715594a066d9cd43f13eff9d098157f2
parent 089309b0bdd2b2f441f42c8143d9c9ab11abc4ad (diff)
download beautifulsoup4-835a38cf6d4ad8dcd75105a407ade350fbe77b8b.tar.gz
Let's get some profiling going.
-rw-r--r-- bs4/__init__.py 22
-rw-r--r-- bs4/diagnose.py 23
-rw-r--r-- bs4/tests/test_soup.py 1
3 files changed, 33 insertions, 13 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 0dded3a..4e268ef 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -208,7 +208,7 @@ class BeautifulSoup(Tag):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
- self.currentData = []
+ self.current_data = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
@@ -244,21 +244,21 @@ class BeautifulSoup(Tag):
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
- if self.currentData:
- currentData = u''.join(self.currentData)
- if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+ if self.current_data:
+ current_data = u''.join(self.current_data)
+ if (current_data.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
- if '\n' in currentData:
- currentData = '\n'
+ if '\n' in current_data:
+ current_data = '\n'
else:
- currentData = ' '
- self.currentData = []
+ current_data = ' '
+ self.current_data = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
- not self.parse_only.search(currentData)):
+ not self.parse_only.search(current_data)):
return
- o = containerClass(currentData)
+ o = containerClass(current_data)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, most_recent_element=None):
@@ -328,7 +328,7 @@ class BeautifulSoup(Tag):
self._popToTag(name, nsprefix)
def handle_data(self, data):
- self.currentData.append(data)
+ self.current_data.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index f9bff28..b6eaa8d 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -1,10 +1,15 @@
"""Diagnostic functions, mainly for use when doing tech support."""
+import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
+import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
+
import os
+import pstats
import random
+import tempfile
import time
import traceback
import sys
@@ -174,5 +179,21 @@ def benchmark_parsers(num_elements=100000):
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
+def profile(num_elements=100000, parser="lxml"):
+
+ filehandle = tempfile.NamedTemporaryFile()
+ filename = filehandle.name
+
+ data = rdoc(num_elements)
+ vars = dict(bs4=bs4, data=data, parser=parser)
+ cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+ stats = pstats.Stats(filename)
+ stats.strip_dirs()
+ cumulative = stats.sort_stats("cumulative")
+ total = stats.sort_stats("time")
+ import pdb; pdb.set_trace()
+
if __name__ == '__main__':
- diagnose(sys.stdin.read())
+ #diagnose(sys.stdin.read())
+ profile()
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 6219b89..910b37e 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -96,7 +96,6 @@ class TestWarnings(SoupTest):
soup = self.soup("http://www.crummy.com/ is great")
self.assertEqual(0, len(w))
-
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):