判断记录是否存在然后选择是插入还是更新

mysql9年前 (2017)发布 admin

490 0 0

安装方式：命令行下执行
pip install DBUtils
PooledDB的参数：
1. mincached，最少的空闲连接数，如果空闲连接数小于这个数，pool会创建一个新的连接
2. maxcached，最大的空闲连接数，如果空闲连接数大于这个数，pool会关闭空闲连接
3. maxconnections，连接池允许的最大连接数。
4. blocking，当连接数达到最大连接数时再请求连接：如果这个值是True，请求连接的程序会一直等待，直到当前连接数小于最大连接数；如果这个值是False，则会报错。
5. maxshared，当连接数达到这个数时，新请求的连接会共享已经分配出去的连接。

在uwsgi中，每个http请求都会分发给一个进程，连接池中配置的连接数都是以进程为单位的（即上面的最大连接数，指的是一个进程中的连接数）。如果业务中一个http请求需要的sql连接数不是很多（其实大多数都只需要创建一个连接），连接池的连接数就不需要配置得太大。
连接池对性能的提升表现在：
1.在程序创建连接的时候，可以从一个空闲的连接中获取，不需要重新初始化连接，提升获取连接的速度
2.关闭连接的时候，把连接放回连接池，而不是真正的关闭，所以可以减少频繁地打开和关闭连接
以上来自http://www.cnblogs.com/Xjng/p/3437694.html

#-*-coding:utf-8-*-
import urllib.request
from bs4 import BeautifulSoup
import MySQLdb
import datetime
import re
import urllib.request as urlreq
from urllib import error
from DBUtils.PooledDB import PooledDB #数据库连接池
# Shared database connection pool (DBUtils PooledDB on top of MySQLdb).
# mincached=5 is the minimum number of idle connections kept in the pool.
# NOTE(review): the host and credentials are hard-coded below — consider
# loading them from configuration or the environment instead.
pool = PooledDB(
    MySQLdb,
    mincached=5,
    host='192.168.253.138',
    user='chi_na_cn',
    passwd='3ZpKnZWBwN',
    db='chi_na_cn',
    port=3306,
)

# Sample paths: a forum listing page and a single thread page.
url_list = "/forum.php?mod=forumdisplay&fid=43&page=2&mobile=2"
url_text = "/forum.php?mod=viewthread&tid=307931&extra=page%3D1&mobile=2"
# The script below starts from the listing page.
url = url_list
def getPage(url):
    """Download a page while identifying as a mobile browser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the page cannot
        be fetched or decoded.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Mobile Safari/537.36'}
    try:
        req = urlreq.Request(url, headers=headers)
        # The context manager closes the response even if read/decode fails.
        with urlreq.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError and socket-level failures;
        # ValueError covers malformed URLs (for example a missing scheme)
        # and bodies that are not valid UTF-8.
        return None

def _squeeze(raw):
    """Trim *raw* and delete every space, newline, tab and carriage return."""
    return raw.strip().translate(str.maketrans('', '', ' \n\t\r')).strip()


def cj_text(url):
    """Scrape the content of one thread page.

    Args:
        url: Address of the thread page; it is handed to ``getPage``.

    Returns:
        A tuple ``(title, author, text, posted_at)``. ``title`` and ``text``
        are ``repr()`` strings of the whitespace-stripped heading and message
        body (``repr`` keeps invisible characters visible), ``author`` is the
        text of the link inside the ``authi`` element, and ``posted_at`` is
        the first ``20YY-M-D HH:MM:SS`` timestamp found in that element.
        ``None`` is returned when the page cannot be fetched, an expected
        element is missing, or no timestamp is present.
    """
    html = getPage(url)
    if html is None:
        return None

    # Per the original note, naming the parser explicitly keeps bs4 from
    # complaining about an unspecified parser.
    soup = BeautifulSoup(html, "html.parser")
    heading = soup.find("h2")
    message = soup.find(class_="message")
    author_box = soup.find(class_="authi")
    try:
        author = author_box.b.a.text
        # Non-breaking spaces are removed from the body only, as before.
        text = repr(_squeeze(message.text).replace('\xa0', ''))
        title = repr(_squeeze(heading.text))
    except AttributeError:
        # One of the expected elements was not found on the page.
        return None

    stamps = re.findall(r'20\d{2}-\d+-\d+\s\d{2}:\d{2}:\d{2}', str(author_box))
    if not stamps:
        return None
    return title, author, text, stamps[0]
def cj_list(url):
    """Collect thread links from one forum listing page.

    Intended crawl flow, per the original author's notes: start from a given
    board page, add the links of its first page to the URL store, then fetch
    the next-page link and repeat.

    Args:
        url: Address of the listing page; it is handed to ``getPage``.

    Returns:
        A list with the site prefix prepended to every ``href`` longer than
        20 characters found on anchors inside ``threadlist`` elements. The
        list is empty when the page cannot be fetched.
    """
    base = 'http://www.xxxxx.com/'
    html = getPage(url)
    if html is None:
        return []

    soup = BeautifulSoup(html, "html.parser")
    links = []
    for container in soup.find_all(class_="threadlist"):
        for anchor in container.find_all('a'):
            href = anchor.get("href")
            # Anchors without an href are skipped instead of crashing len().
            if href and len(href) > 20:
                links.append(base + href)
    return links
def cj_list_next(url):
    """Return the address of the next listing page.

    Args:
        url: Address of the current listing page; it is handed to ``getPage``.

    Returns:
        The site prefix joined with the ``href`` of the last element carrying
        the ``nxt`` class, or ``None`` when the page cannot be fetched, has no
        such element, or that element has no ``href``.
    """
    base = 'http://www.huolinhe.com/'
    html = getPage(url)
    if html is None:
        return None

    # Per the original note, naming the parser explicitly keeps bs4 from
    # complaining about an unspecified parser.
    soup = BeautifulSoup(html, "html.parser")
    candidates = soup.find_all(class_='nxt')
    if not candidates:
        return None

    # The original loop kept only the last match, so do the same here.
    href = candidates[-1].get("href")
    if href is None:
        return None
    return base + href
def cj_pass(str):
    """Check whether a URL is already stored in the ``urls`` table.

    Args:
        str: The URL to look up. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)

    Returns:
        ``True`` when the lookup returns at least one row, else ``False``.
    """
    conn = pool.connection()
    try:
        cursor = conn.cursor()
        try:
            # Let the driver bind the value: scraped URLs may contain quotes,
            # which would break or subvert a string-formatted statement.
            cursor.execute("select 1 from urls where url = %s", (str,))
            return any(row[0] == 1 for row in cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Closing a pooled connection hands it back to the pool.
        conn.close()

def url_ru(str):
    """Insert a batch of URLs into the ``urls`` table with a timestamp.

    Every URL is stored together with the current local time in ``up_date``.
    Database errors are printed rather than raised, as before.

    NOTE(review): the original inline note says the time should not be
    updated when a URL is collected, only after its content is scraped, yet
    this insert stamps ``up_date`` immediately — confirm the intended
    behaviour.

    Args:
        str: List of URL strings to insert. The list is left unmodified.
            (The name shadows the builtin; it is kept so existing callers
            are unaffected.)
    """
    conn = pool.connection()
    try:
        cursor = conn.cursor()
        try:
            up_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            sql_insert = "insert into urls(url,up_date) values(%s,%s)"
            # Build the parameter rows separately so the caller's list is not
            # rewritten in place.
            rows = [(link, up_date) for link in str]
            try:
                cursor.executemany(sql_insert, rows)
                conn.commit()
            except Exception as e:
                print(e)
        finally:
            cursor.close()
    finally:
        # Closing a pooled connection hands it back to the pool.
        conn.close()
def cj_updata():
    """Scrape page content (placeholder: nothing is implemented yet)."""
    return None
# Script body: probe the listing page, collect its thread URLs, and store the
# ones that are not in the database yet.
if getPage(url) is None:
    # The page could not be fetched, so there is nothing to scrape.
    print('返回值为空')
else:
    # Manual checks used during development:
    #   print(cj_text(url_text))      # scrape one thread's content
    #   print(cj_list(url))           # scrape the URL list
    #   print(cj_list_next(url))      # scrape the next-page link
    #   cj_list_next('/forum.php?mod=forumdisplay&fid=43&page=1000&mobile=2')
    #   returning None would mean the crawl has reached the last page.
    num_list = cj_list(url_list)
    print('本次采集URL', len(num_list), '条')

    # Drop links repeated within this page (order kept): the database check
    # below cannot see duplicates that are only in this batch.
    num_list = list(dict.fromkeys(num_list))

    # Walk backwards so removing an entry does not shift pending indexes.
    for i in reversed(range(len(num_list))):
        if cj_pass(num_list[i]):
            # Already recorded: discard it.
            num_list.pop(i)
        else:
            # New record: keep it for insertion.
            print(num_list[i], '将入库')

    if num_list:
        url_ru(num_list)
    else:
        print('未入库')