Python 3 Introductory Tutorial
Published: 2019-06-19


Environment used here:

python : 3.5

jdk : 1.7

eclipse : 4.5.2 (a bit old; it really wants the matching Neon 4.6 release, otherwise a prompt dialog keeps popping up)

MySQLdb officially only supports up to Python 3.4, so on Python 3.5 this tutorial uses the third-party library PyMySQL to connect to the MySQL database.

PyMySQL download address:

Installation on Windows:

After downloading and unpacking, change into the PyMySql-0.6.7 directory and run python setup.py install (on newer setups, pip install PyMySQL achieves the same).
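If the install succeeded, the module imports cleanly; a quick sanity check (PyMySQL exposes a VERSION tuple at module level):

# confirm PyMySQL is importable and see which version was installed
import pymysql
print(pymysql.VERSION)  # e.g. (0, 6, 7, None) for the release used here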


test1.py

import urllib.request as request

def baidu_tieba(url, begin_page, end_page):
    for i in range(begin_page, end_page + 1):
        sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
        print('Downloading page ' + str(i) + ', saving as ' + sName)
        m = request.urlopen(url + str(i)).read()
        with open(sName, 'wb') as file:
            file.write(m)
        # no explicit close needed: the with block closes the file

if __name__ == "__main__":
    url = "http://tieba.baidu.com/p/"
    begin_page = 1
    end_page = 3
    baidu_tieba(url, begin_page, end_page)
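A side note on str(i).zfill(5): it pads the page number with leading zeros so the saved files sort in order:

# zfill pads a string with zeros on the left up to the given width
print(str(7).zfill(5))    # 00007
print(str(123).zfill(5))  # 00123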


test2.py

import urllib.request as request
import re
import os
import urllib.error as error

def baidu_tieba(url, begin_page, end_page):
    count = 1
    for i in range(begin_page, end_page + 1):
        sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
        print('Downloading page ' + str(i) + ', saving as ' + sName)
        m = request.urlopen(url + str(i)).read()
        # create a directory per page to hold the images found on it
        dirpath = 'D:/360Downloads/test/'
        dirname = str(i)
        new_path = os.path.join(dirpath, dirname)
        if not os.path.isdir(new_path):
            os.makedirs(new_path)
        page_data = m.decode('gbk', 'ignore')
        # NOTE: the rest of this script was lost when the page was scraped
        # (the <img ...> pattern swallowed everything after it); the regex
        # and download loop below are a plausible reconstruction, not the
        # author's original code
        page_image = re.compile(r'<img src="(.+?\.jpg)"')
        for image_url in page_image.findall(page_data):
            try:
                image = request.urlopen(image_url).read()
                with open(os.path.join(new_path, str(count) + '.jpg'), 'wb') as f:
                    f.write(image)
                count += 1
            except error.URLError:
                print('Failed to download ' + image_url)

if __name__ == "__main__":
    baidu_tieba("http://tieba.baidu.com/p/", 1, 3)


test3.py

# python3.4 crawler tutorial
# download the images from a web page
# by 林炳文Evankaka (blog: http://blog.csdn.net/evankaka/)
import urllib.request
import socket
import re
import sys
import os

targetDir = r"D:\PythonWorkPlace\load"  # where downloaded files are saved

def destFile(path):
    if not os.path.isdir(targetDir):
        os.makedirs(targetDir)
    pos = path.rindex('/')
    t = os.path.join(targetDir, path[pos + 1:])
    print(t)
    return t

if __name__ == "__main__":  # program entry point
    weburl = "http://www.douban.com/"
    webheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=weburl, headers=webheaders)  # build the request with headers
    webpage = urllib.request.urlopen(req)  # send the request
    contentBytes = webpage.read()
    # regex search for every image URL in the page
    for link, t in set(re.findall(r'(https:[^\s]*?(jpg|png|gif))', str(contentBytes))):
        print(link)
        try:
            urllib.request.urlretrieve(link, destFile(link))  # download the image
        except Exception:
            print('download failed')  # keep going on errors
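urlretrieve belongs to urllib's legacy interface, so it is worth knowing the plain urlopen equivalent; a minimal sketch (the download helper name is made up, and destFile is the function from test3.py):

# same download without urlretrieve: read the bytes, write them out
import urllib.request

def download(link, dest):
    data = urllib.request.urlopen(link).read()
    with open(dest, 'wb') as f:
        f.write(data)

# usage with the destFile helper above:
# download(link, destFile(link))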


test4.py

'''
First example: a simple web crawler

Fetch a 58.com listing page
'''

import urllib.request

# target URL
url = "http://bj.58.com/caishui/28707491160259x.shtml?adtype=1&entinfo=28707491160259_0&adact=3&psid=156713756196890928513274724"

# build the request
request = urllib.request.Request(url)

# fetch the result
response = urllib.request.urlopen(request)

data = response.read()

# choose the decoding
data = data.decode('utf-8')

# print the result
print(data)

# print various details about the fetched page
# print(type(response))
# print(response.geturl())
# print(response.info())
# print(response.getcode())
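Note that urlopen raises urllib.error.HTTPError or URLError when the request fails, and the script above catches neither; a minimal sketch of the usual guard (the URL here is just the listing site's front page):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("http://bj.58.com/")
    print(response.getcode())  # HTTP status code, e.g. 200
except urllib.error.HTTPError as e:
    print('server returned an error status:', e.code)
except urllib.error.URLError as e:
    print('failed to reach the server:', e.reason)

HTTPError is a subclass of URLError, so it has to be caught first.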


test5.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib.request as request
from bs4 import BeautifulSoup as bs
import csv
import os

def GetAllLink():
    num = int(input("How many pages to crawl:> "))
    if not os.path.exists('./data/'):
        os.mkdir('./data/')

    for i in range(num):
        if i + 1 == 1:
            url = 'http://nj.58.com/piao/'
        else:
            url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
        GetPage(url, i)

def GetPage(url, num):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    table = soup.table
    tag = table.find_all('tr')
    # re-parse just the rows we need
    soup2 = bs(str(tag), "html.parser")
    title = soup2.find_all('a', 't')        # title and URL
    price = soup2.find_all('b', 'pri')      # current price
    fixedprice = soup2.find_all('del')      # original price
    date = soup2.find_all('span', 'pr25')   # show time

    atitle = []
    ahref = []
    aprice = []
    afixedprice = []
    adate = []

    for i in title:
        atitle.append(i.get_text())
        ahref.append(i.get('href'))
    for i in price:
        aprice.append(i.get_text())
    for i in fixedprice:
        afixedprice.append(i.get_text())
    for i in date:
        adate.append(i.get_text())

    # newline='' stops the csv module writing blank rows on Windows
    csvfile = open('./data/ticket_%s.csv' % num, 'w', newline='')
    writer = csv.writer(csvfile)
    writer.writerow(['title', 'url', 'price', 'original price', 'show time'])
    # every record has a title, but not necessarily a date;
    # pad any missing fields with '---'
    if len(atitle) > len(adate):
        for i in range(len(atitle) - len(adate)):
            adate.append('---')
        for i in range(len(atitle) - len(afixedprice)):
            afixedprice.append('---')
        for i in range(len(atitle) - len(aprice)):
            aprice.append('---')

    for i in range(len(atitle)):
        writer.writerow([atitle[i], ahref[i], aprice[i], afixedprice[i], adate[i]])
    print("[Result]:> page %s saved!" % (num + 1))
    csvfile.close()

if __name__ == '__main__':
    GetAllLink()
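In calls like find_all('a', 't') above, the second positional argument is a CSS class filter, so it matches <a class="t"> tags; a small self-contained demonstration:

from bs4 import BeautifulSoup

html = '<a class="t" href="/x">match</a><a href="/y">no match</a>'
soup = BeautifulSoup(html, "html.parser")
print(soup.find_all('a', 't'))         # only the <a class="t"> tag
print(soup.find_all('a', class_='t'))  # the equivalent keyword form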


test6.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request as request
from bs4 import BeautifulSoup as bs

def GetAllLink():
    num = int(input("How many pages to crawl:> "))

    for i in range(num):
        if i + 1 == 1:
            url = 'http://bj.58.com/caishui/?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&jump=1&specialtype=gls'
        else:
            url = 'http://bj.58.com/caishui/pn%s/' % (i + 1) + '?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&specialtype=gls&PGTID=0d30215f-0000-1941-5161-367b7a641048&ClickID=4'
        GetPage(url, i)

def GetPage(url, num):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    table = soup.table
    tag = table.find_all('tr')

    # re-parse just the rows we need
    soup2 = bs(str(tag), "html.parser")

    title = soup2.find_all('a', 't')                 # title and URL
    companyName = soup2.find_all('a', 'sellername')  # company name

    atitle = []
    ahref = []
    acompanyName = []

    for i in title:
        atitle.append(i.get_text())
        ahref.append(i.get('href'))
    for i in companyName:
        acompanyName.append(i.get_text())
    for i in range(len(ahref)):
        getSonPage(str(ahref[i]))

def getSonPage(url):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    headers = {'User-Agent': user_agent}
    req = request.Request(url, headers=headers)
    page = request.urlopen(req).read().decode('utf-8')
    soup = bs(page, "html.parser")
    print("=========================")
    # category
    print(soup.find('div', 'su_con').get_text())
    # service area
    print(soup.find('div', 'su_con quyuline').get_text())
    # contact person
    print(soup.find_all('ul', 'suUl')[0].find_all('li')[2].find_all('a')[0].get_text())
    # business address
    print(soup.find_all('ul', 'suUl')[0].find_all('li')[3].find('div', 'su_con').get_text()
          .replace("\n", '').replace("\r", '').replace('\t', '').replace(' ', ''))
    # service items (strip the page's divider lines and map caption)
    print(soup.find('article', 'description_con').get_text()
          .replace("_____________________________________", "\n\r")
          .replace("___________________________________", "\n\r")
          .replace("(以下为公司北京区域分布图)", ""))
    print("=========================")

if __name__ == '__main__':
    GetAllLink()
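getSonPage assumes every detail page has the same layout; find returns None when an element is missing, and the chained get_text() calls then raise AttributeError. A hedged guard pattern (safe_text is a made-up helper, not part of the original script):

def safe_text(tag, default='---'):
    # return the tag's text, or a placeholder when find() found nothing
    return tag.get_text() if tag is not None else default

# usage inside getSonPage:
# print(safe_text(soup.find('div', 'su_con')))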


test7.py

import pymysql

# charset must be the lowercase name 'utf8'; pymysql looks the charset up by name
conn = pymysql.connect(host='192.168.1.102', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
cur = conn.cursor()
cur.execute("select version()")
for row in cur:
    print(row)
cur.close()
conn.close()
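The same cursor handles writes; values should go through %s placeholders so pymysql escapes them, and the connection needs an explicit commit since autocommit is off by default. A minimal sketch (the demo table and its columns are invented for illustration):

import pymysql

conn = pymysql.connect(host='192.168.1.102', port=3306, user='root',
                       passwd='123456', db='test', charset='utf8')
cur = conn.cursor()
# hypothetical table: demo(name varchar, price int)
cur.execute("insert into demo (name, price) values (%s, %s)", ('ticket', 42))
conn.commit()  # pymysql does not autocommit by default
cur.close()
conn.close()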

