百木园-与人分享,
就是让自己快乐。

慕课网爬虫

'''本 demo 爬取慕课网实战课中前端、后端、移动开发、云计算大数据、数据库等分类的部分页面,并抓取这些页面下所有课程的页面信息。

  代码如有需要改进之处,请指出,谢谢。

'''
# author:Administrator
# date:2021/04/30
3
import json  # serialises each scraped record
import os  # builds the output file path
import re  # regular expressions used to scrape the pages
from multiprocessing import Pool  # worker processes

import requests  # third-party HTTP client
from requests.exceptions import RequestException  # download error handling
9
10
11
def geturl(url, timeout=10):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text when the server answers with status 200.
        ``None`` when the request raises ``RequestException``, the status is
        anything other than 200, or the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.content.decode("utf-8")
    except UnicodeDecodeError:
        # Callers already treat None as "download failed"; an undecodable
        # body is reported the same way instead of crashing the crawl.
        return None
20
homeurl = 'https://coding.imooc.com'

# Course URLs collected so far, shaped as {stuname: [url, ...]}.
stuname_dict_url = {}

# Captures the href of every <a target="_blank" href="..."> tag.
_COURSE_LINK_PATTERN = re.compile(r'.*?<a target="_blank" href="(.*?)">', re.S)


def parse_one_classUrl(html, stuname):
    """Extract course links from ``html`` and record them under ``stuname``.

    Every captured ``href`` value is prefixed with ``homeurl`` and the
    resulting list replaces any previous entry for ``stuname`` in the
    module-level ``stuname_dict_url``.

    Args:
        html: Page markup to scan, or ``None`` when the download failed.
        stuname: Key under which the extracted URLs are stored.

    Returns:
        The shared ``stuname_dict_url`` mapping (the same object on every
        call), now containing ``stuname``. A ``None`` page stores an empty
        list rather than raising ``TypeError``.
    """
    links = [] if html is None else _COURSE_LINK_PATTERN.findall(html)
    stuname_dict_url[stuname] = [homeurl + link for link in links]
    return stuname_dict_url
31
32
# Regex that pulls the title and the four labelled figures out of a course page.
_COURSE_PAGE_PATTERN = re.compile(
    '.*?<div class="title-box">.*?<h1>(.*?)</h1>'
    '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
    re.S,
)


def parse_one_page(html, url, stuname):
    """Yield one record describing the course found in ``html``.

    Only the first match of the page pattern is used.

    Args:
        html: Course page markup, or ``None`` when the download failed.
        url: Address the page was fetched from; copied into the record.
        stuname: Topic label the course belongs to; copied into the record.

    Yields:
        A dict with the keys ``title``, ``difficulty``, ``duration``,
        ``stu_number``, ``comprehensive_evaluation``, ``url`` and
        ``stuname``. Nothing is yielded when ``html`` is ``None`` or does
        not match the expected markup, so one unparsable page no longer
        aborts the whole run.
    """
    if html is None:
        return
    match = _COURSE_PAGE_PATTERN.search(html)
    if match is None:
        return
    title, difficulty, duration, stu_number, evaluation = match.groups()
    yield {
        'title': title,
        'difficulty': difficulty,
        'duration': duration,
        'stu_number': stu_number,
        'comprehensive_evaluation': evaluation,
        'url': url,
        'stuname': stuname,
    }
58
# Collect the course URL lists for every topic of a category mapping.
def getClassurl(dict):
    """Download each topic's listing page and gather its course URLs.

    Args:
        dict: Nested mapping ``{category: {stuname: listing_url}}``. The
            parameter name shadows the builtin but is kept so existing
            callers keep working.

    Returns:
        The mapping returned by the last successful ``parse_one_classUrl``
        call, i.e. ``{stuname: [url, ...]}``. An empty dict is returned when
        there was nothing to parse (empty input or every download failed);
        previously an empty input raised ``UnboundLocalError``.
    """
    collected = {}
    for topics in dict.values():
        for stuname, listing_url in topics.items():
            html = geturl(listing_url)
            if html is None:
                # Download failed: there is no markup to scan for this topic.
                continue
            collected = parse_one_classUrl(html, stuname)
    return collected
67
# Append one record to a text file.
def write_to_file(name, content):
    """Append ``content`` as one JSON line to ``../text/<name>.txt``.

    The path is relative to the current working directory and the ``text``
    directory must already exist. It is assembled with ``os.path.join`` so
    the separators are right on every platform; on Windows the result is the
    same ``..\\text\\<name>.txt`` as before.

    Args:
        name: File name without the ``.txt`` extension.
        content: JSON-serialisable object to write.
    """
    path = os.path.join('..', 'text', '%s.txt' % name)
    # The with-block closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
73
74
# Listing pages to crawl, shaped as {category: {topic: listing URL}}.
dict_qd = {
    '前端': {
        'vue.js': 'https://coding.imooc.com/?c=vuejs',
        'HTML/CSS': 'https://coding.imooc.com/?c=html',
        'JavaScript': 'https://coding.imooc.com/?c=javascript',
        'Node.js': 'https://coding.imooc.com/?c=nodejs',
    },
}
dict_hd = {
    '后端': {
        'java': 'https://coding.imooc.com/?c=java',
        'SpringBoot': 'https://coding.imooc.com/?c=springboot',
        'SpringCloud': 'https://coding.imooc.com/?c=springcloud',
    },
}
dict_ydkf = {
    '移动开发': {
        'android': 'https://coding.imooc.com/?c=android',
        'ios': 'https://coding.imooc.com/?c=ios',
        'Reactnative': 'https://coding.imooc.com/?c=reactnative',
    },
}
dict_yun = {
    '云计算大数据': {
        'hadoop': 'https://coding.imooc.com/?c=hadoop',
        '大数据': 'https://coding.imooc.com/?c=bigdata',
        'Spark': 'https://coding.imooc.com/?c=spark',
        'Docker': 'https://coding.imooc.com/?c=docker',
    },
}
dict_db = {
    '数据库': {
        'mysql': 'https://coding.imooc.com/?c=mysql',
        'redis': 'https://coding.imooc.com/?c=redis',
        'mongodb': 'https://coding.imooc.com/?c=mongodb',
    },
}
80
def main(categories=None, output_name="dict_db"):
    """Crawl the listing pages, then every course page, and store the records.

    Args:
        categories: Mapping ``{category: {stuname: listing_url}}`` to crawl.
            Defaults to ``dict_db``, which is what the script crawled before
            this parameter existed.
        output_name: Base name handed to ``write_to_file`` for every record.
    """
    if categories is None:
        categories = dict_db

    pool = Pool(processes=5)
    try:
        # Course URLs per topic, gathered in a worker process.
        url_dict = pool.apply(getClassurl, (categories,))
        for stuname, urls in url_dict.items():
            # map() spreads the downloads over the workers and returns the
            # pages in the same order as ``urls``. The previous
            # apply_async(...).get() per URL waited for each download before
            # starting the next, so the pool never ran in parallel.
            pages = pool.map(geturl, urls)
            for url, classhtml in zip(urls, pages):
                print(stuname, url)
                if classhtml is None:
                    # Download failed; there is nothing to parse.
                    continue
                for item in parse_one_page(classhtml, url, stuname):
                    write_to_file(output_name, item)
    finally:
        # Release the worker processes even when the crawl raises.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()

来源:https://www.cnblogs.com/zhangqifeng2021/p/14758084.html
图文来源于网络,如有侵权请联系删除。

未经允许不得转载:百木园 » 慕课网爬虫

相关推荐

  • 暂无文章