Here is how to calculate HTTP content size.
This script fetches every image referenced by an HTML page and computes the content size from the HTTP Content-Length header of each response.
This script uses the BeautifulSoup module, so install it first if you have not already done so.
# pip install beautifulsoup4
|
Here is the script.
# cat HTTP_calc_content_size.py
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import re
try:
    from urlparse import urljoin  # Python 2, the interpreter urllib2 belongs to
except ImportError:
    from urllib.parse import urljoin  # keeps the pure helpers importable on Python 3

# Page whose images are measured; uncomment one of the alternatives to switch sites.
url = "http://jp.msn.com"
#url = "http://www.apple.com"
#url = "http://www.amazon.co.jp"
#url = "http://www.yahoo.co.jp"
#url = "http://www.google.co.jp"


def resolve_image_url(page_url, src):
    """Return the absolute URL for an <img> ``src`` found on ``page_url``.

    Uses standard URL joining, so absolute, root-relative ("/a.png"),
    path-relative ("a.png") and protocol-relative ("//host/a.png")
    references are all resolved against the page URL.
    """
    return urljoin(page_url, src)


def select_image_urls(page_url, srcs):
    """Turn raw ``src`` attribute values into a list of fetchable URLs.

    Missing (None) or blank values are dropped, and so is anything that
    does not resolve to an http/https URL (for example inline ``data:``
    images), because those cannot be fetched to read a header.
    """
    image_urls = []
    for src in srcs:
        src = (src or "").strip()
        if not src:
            continue
        absolute = resolve_image_url(page_url, src)
        if absolute.lower().startswith(("http://", "https://")):
            image_urls.append(absolute)
    return image_urls


def find_image_srcs(html):
    """Return the ``src`` value of every <img> tag in ``html``.

    Tags without a ``src`` attribute yield None instead of raising.
    """
    soup = BeautifulSoup(html)
    return [img.get('src') for img in soup.find_all('img')]


def fetch_content_length(image_url, timeout=3):
    """Fetch ``image_url`` and return its Content-Length as an int.

    Returns None when the response has no content-length header or the
    header value is not an integer. Network errors are not caught here.
    """
    conn = urllib2.urlopen(image_url, timeout=timeout)
    try:
        raw_length = conn.headers.get('content-length')
    finally:
        # Only the header is needed, so release the connection right away.
        conn.close()
    if raw_length is None:
        return None
    try:
        return int(raw_length)
    except ValueError:
        return None


def summarize(sizes):
    """Return ``(average, maximum, minimum)`` of ``sizes``, or None if empty."""
    if not sizes:
        return None
    # float() keeps true division on Python 2, where int / int truncates.
    return (sum(sizes) / float(len(sizes)), max(sizes), min(sizes))


def main():
    """Measure the images of ``url`` and print count, average, max and min."""
    # connect to the site
    page = urllib2.urlopen(url)
    try:
        content = page.read()
    finally:
        page.close()

    # find img tags and resolve their sources
    image_urls = select_image_urls(url, find_image_srcs(content))

    # fetch all images and retrieve content-length header
    sizes = []
    for image_url in image_urls:
        size = fetch_content_length(image_url)
        if size is not None:
            sizes.append(size)

    # print results; single-argument print(...) emits the same text on
    # Python 2 as the print statement would
    print("site name : %s" % url)
    print("number of objects : %s" % len(sizes))
    stats = summarize(sizes)
    if stats is None:
        print("no image reported a content-length; nothing to average")
        return
    average, largest, smallest = stats
    print("average : %d bytes" % average)
    print("max : %d bytes" % largest)
    print("minimum : %d bytes" % smallest)


if __name__ == "__main__":
    main()
|
Here are the outputs.
- msn
# ./HTTP_calc_content_size.py
site name : http://jp.msn.com
number of objects : 51
average : 2540 bytes
max : 9270 bytes
minimum : 42 bytes
|
- amazon
# ./HTTP_calc_content_size.py
site name : http://www.amazon.co.jp
number of objects : 52
average : 8457 bytes
max : 77708 bytes
minimum : 43 bytes
|
- google
# ./HTTP_calc_content_size.py
site name : http://www.google.co.jp
number of objects : 1
average : 1834 bytes
max : 1834 bytes
minimum : 1834 bytes
|
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.