五、基于hadoop的nginx访问日志分析--userAgent和spider

时间:2022-05-06 09:10:15

useragent:

代码(不包含蜘蛛):

# cat top_10_useragent.py
#!/usr/bin/env python
# coding=utf-8 from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser import heapq class UserAgent(MRJob): nginx_line_parser = NginxLineParser() def mapper(self, _, line): self.nginx_line_parser.parse(line)
field_item = self.nginx_line_parser.http_user_agent
if field_item is not None:
yield field_item, 1 def reducer_sum(self, key, values): yield None, (sum(values), key) def reducer_top100(self, _, values):
for count, path in heapq.nlargest(10, values):
yield count, path
# for count, path in sorted(values, reverse=True)[:10]:
# yield count, path def steps(self):
return (
MRStep(mapper=self.mapper,
reducer=self.reducer_sum
),
MRStep(reducer=self.reducer_top100)
) def main():
UserAgent.run() if __name__ == '__main__':
main()

结果:

# python3 top_10_useragent.py access_all.log-20161227
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_useragent.root.20161228.090725.308144
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_useragent.root.20161228.090725.308144/output...
85262 "IE"
79611 "Chrome"
48560 "Other"
10662 "Firefox"
7927 "Mobile Safari UI/WKWebView"
7182 "Sogou Explorer"
6681 "QQ Browser"
1988 "Mobile Safari"
1781 "Maxthon"
1404 "Edge"
Removing temp directory /tmp/top_10_useragent.root.20161228.090725.308144...

蜘蛛:

#!/usr/bin/env python
# coding=utf-8 from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser import heapq class Spider(MRJob): nginx_line_parser = NginxLineParser() def mapper(self, _, line): self.nginx_line_parser.parse(line)
field_item = self.nginx_line_parser.user_agent_type
if field_item is not None:
yield field_item, 1 def reducer_sum(self, key, values): yield None, (sum(values), key) def reducer_top100(self, _, values):
for count, path in heapq.nlargest(10, values):
yield count, path
# for count, path in sorted(values, reverse=True)[:10]:
# yield count, path def steps(self):
return (
MRStep(mapper=self.mapper,
reducer=self.reducer_sum
),
MRStep(reducer=self.reducer_top100)
) def main():
Spider.run() if __name__ == '__main__':
main()

执行结果:

# python3 top_10_spider.py access_all.log-20161227
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_spider.root.20161228.091326.295972
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_spider.root.20161228.091326.295972/output...
33542 "magpie-crawler"
25880 "Other"
16578 "Sogou web spider"
6383 "bingbot"
3688 "Baiduspider"
1487 "Yahoo! Slurp"
1096 "JikeSpider"
731 "YisouSpider"
648 "Baiduspider-image"
470 "Googlebot"
Removing temp directory /tmp/top_10_spider.root.20161228.091326.295972...