Environment: Python 3.5, JDK 1.7, Eclipse 4.5.2 (a bit old; it should really be the matching Neon 4.6 release, otherwise a prompt dialog keeps popping up).
The official MySQLdb only supports up to Python 3.4, so under Python 3.5 we use the third-party library PyMySQL to connect to the MySQL database.
PyMySQL download address:
Installation on Windows:
After downloading and unpacking, enter the PyMySql-0.6.7 directory and run python setup.py install to install it.
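As an alternative to the setup.py route (not part of the original steps), PyMySQL can usually be installed straight from PyPI:

pip install PyMySQL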
test1.py
import urllib.request as request

def baidu_tieba(url, begin_page, end_page):
    for i in range(begin_page, end_page + 1):
        sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
        print('Downloading page ' + str(i) + ' and saving it as ' + sName)
        m = request.urlopen(url + str(i)).read()
        with open(sName, 'wb') as file:
            file.write(m)  # the with-block closes the file automatically

if __name__ == "__main__":
    url = "http://tieba.baidu.com/p/"
    begin_page = 1
    end_page = 3
    baidu_tieba(url, begin_page, end_page)
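test1.py aborts on the first network error. As a hedged sketch (the fetch_page helper and the timeout value are mine, not the original author's), the download step can be wrapped so a failed page is skipped instead of killing the whole run:

import urllib.request as request
import urllib.error as error

def fetch_page(url, timeout=10):
    # Return the page bytes, or None if the request fails.
    # error.HTTPError subclasses error.URLError, so one except covers both.
    try:
        return request.urlopen(url, timeout=timeout).read()
    except error.URLError as e:
        print('skipping', url, '->', e)
        return None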
test2.py
import urllib.request as request
import urllib.error as error
import re
import os

def baidu_tieba(url, begin_page, end_page):
    count = 1
    for i in range(begin_page, end_page + 1):
        sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
        print('Downloading page ' + str(i) + ' and saving it as ' + sName)
        m = request.urlopen(url + str(i)).read()
        # Create a directory to hold the images found on each page
        dirpath = 'D:/360Downloads/test/'
        dirname = str(i)
        new_path = os.path.join(dirpath, dirname)
        if not os.path.isdir(new_path):
            os.makedirs(new_path)
        page_data = m.decode('gbk', 'ignore')
        # The original source was cut off here in extraction; the regex, the
        # download loop, and the entry point below are an assumed reconstruction
        # of a typical <img src="...jpg"> matcher, not the author's exact code.
        page_image = re.compile(r'<img src="(.+?\.jpg)"')
        for image_url in page_image.findall(page_data):
            try:
                image_data = request.urlopen(image_url).read()
            except error.URLError:
                continue  # skip images that fail to download
            with open(os.path.join(new_path, str(count) + '.jpg'), 'wb') as f:
                f.write(image_data)
            count += 1

if __name__ == "__main__":
    baidu_tieba("http://tieba.baidu.com/p/", 1, 3)
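A note on the decode('gbk', 'ignore') call: Baidu Tieba pages of that era were typically served in GBK, and the 'ignore' error handler silently drops any bytes that fail to decode, so the regex still gets usable text even when the page contains stray bytes.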
test3.py
#python 3.4 crawler tutorial
#Crawl the images from a website
#林炳文Evankaka (blog: http://blog.csdn.net/evankaka/)
import urllib.request
import re
import os

targetDir = r"D:\PythonWorkPlace\load"  # path where files are saved

def destFile(path):
    if not os.path.isdir(targetDir):
        os.makedirs(targetDir)
    pos = path.rindex('/')
    t = os.path.join(targetDir, path[pos + 1:])
    print(t)
    return t

if __name__ == "__main__":  # program entry point
    weburl = "http://www.douban.com/"
    webheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=weburl, headers=webheaders)  # build the request with headers
    webpage = urllib.request.urlopen(req)  # send the request
    contentBytes = webpage.read()
    # Regex search for all image links on the page
    for link, t in set(re.findall(r'(https:[^\s]*?(jpg|png|gif))', str(contentBytes))):
        print(link)
        try:
            urllib.request.urlretrieve(link, destFile(link))  # download the image
        except Exception:
            print('download failed')  # report and keep going
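urllib.request.urlretrieve is documented as a legacy interface that may be deprecated in the future. A minimal sketch of the same download step using urlopen directly (the download helper name is mine):

import urllib.request

def download(link, dest):
    # Fetch the remote file and write it to disk in one shot.
    with urllib.request.urlopen(link) as resp, open(dest, 'wb') as f:
        f.write(resp.read())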
test4.py
'''
First example: a simple web crawler

Fetch a single page and print its contents
'''

import urllib.request

# URL to fetch
url = "http://bj.58.com/caishui/28707491160259x.shtml?adtype=1&entinfo=28707491160259_0&adact=3&psid=156713756196890928513274724"

# Build the request
request = urllib.request.Request(url)

# Fetch the result
response = urllib.request.urlopen(request)

data = response.read()

# Choose the decoding
data = data.decode('utf-8')

# Print the result
print(data)

# Print various pieces of information about the fetched page
# print(type(response))
# print(response.geturl())
# print(response.info())
# print(response.getcode())
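The commented-out calls at the end are worth trying once: response.geturl() gives the final URL after any redirects, response.info() returns the response headers, and response.getcode() returns the HTTP status code.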
test5.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request as request
from bs4 import BeautifulSoup as bs
import csv
import os

def GetAllLink():
    num = int(input("How many pages to crawl: >"))
    if not os.path.exists('./data/'):
        os.mkdir('./data/')

    for i in range(num):
        if i + 1 == 1:
            url = 'http://nj.58.com/piao/'
        else:
            url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
        GetPage(url, i)


def GetPage(url, num):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    table = soup.table
    tag = table.find_all('tr')
    # Re-parse just the rows we need
    soup2 = bs(str(tag), "html.parser")
    title = soup2.find_all('a', 't')        # title and url
    price = soup2.find_all('b', 'pri')      # sale price
    fixedprice = soup2.find_all('del')      # original price
    date = soup2.find_all('span', 'pr25')   # show date

    atitle = []
    ahref = []
    aprice = []
    afixedprice = []
    adate = []

    for i in title:
        atitle.append(i.get_text())
        ahref.append(i.get('href'))
    for i in price:
        aprice.append(i.get_text())
    for i in fixedprice:
        afixedprice.append(i.get_text())
    for i in date:
        adate.append(i.get_text())

    csvfile = open('./data/ticket_%s.csv' % num, 'w', newline='')
    writer = csv.writer(csvfile)
    writer.writerow(['title', 'url', 'sale price', 'original price', 'show date'])
    # Every record has a title, but not necessarily a date;
    # pad any missing field with '---' so the columns stay aligned.
    for i in range(len(atitle) - len(adate)):
        adate.append('---')
    for i in range(len(atitle) - len(afixedprice)):
        afixedprice.append('---')
    for i in range(len(atitle) - len(aprice)):
        aprice.append('---')

    for i in range(len(atitle)):
        writer.writerow([atitle[i], ahref[i], aprice[i], afixedprice[i], adate[i]])
    print("[Result]:> page %s saved!" % (num + 1))
    csvfile.close()


if __name__ == '__main__':
    GetAllLink()
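The manual '---' padding in GetPage works, but itertools.zip_longest expresses the same column alignment more directly. A hedged alternative sketch (the write_aligned wrapper is mine, operating on the same five lists):

from itertools import zip_longest

def write_aligned(writer, atitle, ahref, aprice, afixedprice, adate):
    # zip_longest pads every shorter list with '---' so the rows stay rectangular.
    for row in zip_longest(atitle, ahref, aprice, afixedprice, adate, fillvalue='---'):
        writer.writerow(row)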
test6.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request as request
from bs4 import BeautifulSoup as bs

def GetAllLink():
    num = int(input("How many pages to crawl: >"))

    for i in range(num):
        if i + 1 == 1:
            url = 'http://bj.58.com/caishui/?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&jump=1&specialtype=gls'
        else:
            url = 'http://bj.58.com/caishui/pn%s/' % (i + 1) + '?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&specialtype=gls&PGTID=0d30215f-0000-1941-5161-367b7a641048&ClickID=4'
        GetPage(url, i)


def GetPage(url, num):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    table = soup.table
    tag = table.find_all('tr')

    # Re-parse just the rows we need
    soup2 = bs(str(tag), "html.parser")

    title = soup2.find_all('a', 't')                 # title and url
    companyName = soup2.find_all('a', 'sellername')  # company name

    atitle = []
    ahref = []
    acompanyName = []

    for i in title:
        atitle.append(i.get_text())
        ahref.append(i.get('href'))
    for i in companyName:
        acompanyName.append(i.get_text())
    for i in range(len(ahref)):
        getSonPage(str(ahref[i]))


def getSonPage(url):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    print("=========================")
    # Category
    print(soup.find('div', 'su_con').get_text())
    # Service area
    print(soup.find('div', 'su_con quyuline').get_text())
    # Contact person
    print(soup.find_all('ul', 'suUl')[0].find_all('li')[2].find_all('a')[0].get_text())
    # Merchant address
    print(soup.find_all('ul', 'suUl')[0].find_all('li')[3].find('div', 'su_con').get_text()
          .replace("\n", '').replace("\r", '').replace('\t', '').replace(' ', ''))
    # Service items (strip the separator bars and a boilerplate caption from the text)
    print(soup.find('article', 'description_con').get_text()
          .replace("_____________________________________", "\n\r")
          .replace("___________________________________", "\n\r")
          .replace("(以下为公司北京区域分布图)", ""))
    print("=========================")

if __name__ == '__main__':
    GetAllLink()
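In these BeautifulSoup calls the second positional argument is matched against the class attribute, so soup.find('div', 'su_con') selects <div class="su_con">; this is standard find/find_all behavior, not a custom helper.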
test7.py
import pymysql

conn = pymysql.connect(host='192.168.1.102', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
cur = conn.cursor()
cur.execute("select version()")
for i in cur:
    print(i)
cur.close()
conn.close()
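test7.py only reads the server version. A hedged sketch of the natural next step, a parameterized INSERT (the demo table and its columns are invented for illustration; the %s placeholders are PyMySQL's standard parameter binding):

import pymysql

conn = pymysql.connect(host='192.168.1.102', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
try:
    with conn.cursor() as cur:
        # The driver escapes the bound values, so no manual quoting is needed.
        cur.execute("insert into demo (name, price) values (%s, %s)",
                    ('ticket', 42))
    conn.commit()  # PyMySQL does not autocommit by default
finally:
    conn.close()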