This script counts the number of queries per label and prints the top N.
label means:
- label 0 (all queries are regarded as “.”)
.
- label 1
.com
- label 2
.foo.com
usage :
python count_queries_by_label_topN.py <label> <top N> <query log>
query log format.
# head -5 a.txt
local.
local.
www.google.com.
ns2.c0.com.
daisy.ubuntu.com.
|
label 0, top 3
# ./count_queries_by_label_topN.py 0 3 a.txt
('.', 2081)
|
label 1, top 3
# ./count_queries_by_label_topN.py 1 3 a.txt
('.com', 1126)
('.localdomain', 632)
('.', 7)
|
label 2, top 3
# ./count_queries_by_label_topN.py 2 3 a.txt
('.ubuntu.com', 576)
('.c0.com', 435)
('.google.com', 106)
|
Here is the script I wrote.
There must be a more efficient way than this.
# cat -n count_queries_by_label_topN.py
1 #!/usr/bin/env python
2
import sys
from collections import Counter
from operator import itemgetter
5
# Results of the most recent count_query() call.  They stay at module level
# for backward compatibility; count_query() resets them on every call so
# repeated calls no longer accumulate stale entries.
dict1 = {}  # suffix -> number of queries
list1 = []  # (suffix, count) pairs, highest count first


def _query_suffix(line, label):
    """Return the last `label` DNS labels of `line` as '.a.b' (None if blank)."""
    name = line.strip()
    if not name:
        return None
    # Drop the root dot of a fully qualified name ("foo.com." -> "foo.com") so
    # names written with and without it are grouped together.
    if name.endswith("."):
        name = name[:-1]
    labels = name.split(".") if name else []
    # labels[-0:] would be the whole list, so label 0 is handled explicitly.
    kept = labels[-label:] if label > 0 else []
    return "." + ".".join(kept)


def count_query(label, topN, querylog):
    """Print the `topN` most frequent query suffixes found in `querylog`.

    Each non-blank line of the log is treated as one queried name and is
    reduced to its last `label` labels (label 0 -> ".", label 1 -> ".com",
    label 2 -> ".foo.com").  Names with fewer labels are kept whole.

    Args:
        label: Number of trailing labels to group by (int or numeric string).
        topN: Maximum number of rows to print (int or numeric string).
        querylog: Path of the query log, one name per line.

    Returns:
        The printed rows as a list of (suffix, count) tuples, highest count
        first.  It is shorter than `topN` when the log has fewer distinct
        suffixes, and empty when the log has no queries.

    Raises:
        ValueError: If `label` or `topN` cannot be converted to int.
        IOError/OSError: If `querylog` cannot be opened.
    """
    label = int(label)
    topN = int(topN)

    # Stream the file and count in one pass instead of list.count() per key.
    counts = Counter()
    with open(querylog, 'r') as f:
        for line in f:
            suffix = _query_suffix(line, label)
            if suffix is not None:
                counts[suffix] += 1

    # most_common() with no argument returns every entry, highest count first.
    ranked = counts.most_common()

    dict1.clear()
    dict1.update(counts)
    del list1[:]
    list1.extend(ranked)

    # Slicing cannot raise IndexError when topN exceeds the number of entries.
    top = ranked[:max(topN, 0)]
    for entry in top:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(entry)
    return top
44
if __name__ == '__main__':
    # Command-line entry point:
    #   python count_queries_by_label_topN.py <label> <top N> <query file>
    # Every usage error prints a message and exits with status 1 so that
    # calling shells and scripts can detect the failure.
    argvs = sys.argv

    if len(argvs) != 4:
        print("Oops :(")
        print("Usage python count_queries_by_label_topN.py <label> <top N> <query file>")
        sys.exit(1)

    # Convert once, and turn non-numeric input into a readable message
    # instead of an uncaught ValueError traceback.
    try:
        label = int(argvs[1])
        top_n = int(argvs[2])
    except ValueError:
        print("Oops :(")
        print("Please specify <label> and <top N> as integers")
        sys.exit(1)

    if label < 0:
        print("Oops :(")
        print("Please specify label greater than or equal to zero")
        sys.exit(1)
    if top_n < 1:
        print("Oops :(")
        print("Please specify top N greater than or equal to one")
        sys.exit(1)

    count_query(label, top_n, argvs[3])
|
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.