USharing
开放博客

Python爬虫常用工具

字符串仅保留英文、中文、数字、中文符号、英文符号:

import re
> 字符串去除转义符
def finstring(string):
    """Strip every character that is not on the allow-list.

    Kept characters: CJK ideographs in U+4E00..U+9FA5, digits, ASCII
    letters, a fixed set of Chinese punctuation marks and a fixed set of
    ASCII punctuation marks (including the space). Everything else, such as
    tabs, newlines, '-', '_' or '=', is dropped.

    Args:
        string: Text to filter.

    Returns:
        The kept characters, concatenated in their original order.
    """
    # Raw literal: the backslash escapes are resolved by the ``re`` module
    # instead of the Python parser, which avoids the invalid-escape
    # SyntaxWarning the non-raw form triggers (\d, \, ...) while matching
    # exactly the same characters.
    kept = re.findall(
        r'[\u4e00-\u9fa5]+|\d+|[a-zA-Z]+|[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+|[\】\【\,\|\!\#\@\$\%\*\/\.\ \[\]\{\}\>\<\\\)\(\^\"\'\;\:]+',
        string)
    return ''.join(kept)

将秒切换为时间格式:

import datetime
def shijian(video_length):
    """Turn a duration given in seconds into a ``datetime.timedelta``.

    Args:
        video_length: Duration in seconds.

    Returns:
        A ``datetime.timedelta``; its ``str()`` form reads like ``H:MM:SS``.
    """
    return datetime.timedelta(seconds=video_length)

时间戳切换为时间:

def zh_ctime(timestamp):
    """Format a Unix timestamp as a local-time string.

    Args:
        timestamp: Seconds since the epoch.

    Returns:
        The local time formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    # Imported here so the function does not depend on an ``import time``
    # appearing somewhere else in the file.
    import time

    # Convert to a local-time struct, then render it in the target format.
    time_local = time.localtime(timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", time_local)

对比两个字符串的相似度,返回相似值(最大长度支持到100):

import difflib
def strduibi(str1, str2):
    """Score the similarity of two strings after padding both with '*'.

    Both strings get the same filler appended: '*' repeated
    ``100 - len(str1)`` times (nothing when ``str1`` has 100 or more
    characters). The score is ``SequenceMatcher.quick_ratio()`` of the
    padded strings, scaled to 0..100 and truncated to an int.

    Args:
        str1: First string; its length decides the amount of padding.
        str2: Second string.

    Returns:
        Integer similarity score between 0 and 100.
    """
    filler = '*' * (100 - len(str1))
    matcher = difflib.SequenceMatcher(a=str1 + filler, b=str2 + filler)
    return int(matcher.quick_ratio() * 100)

对比两个字符串的相似度,返回相似百分比:

import difflib
def strduibi_lv(str1, str2):
    """Return the similarity of two strings as an integer percentage.

    Uses ``SequenceMatcher.quick_ratio()`` on the unmodified strings, scales
    it to 0..100 and truncates to an int.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Integer similarity percentage between 0 and 100.
    """
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return int(matcher.quick_ratio() * 100)

快速转换headers,字符串格式转换字典:

import re
def str2dict(formstr):
    """Parse ``name: value`` lines (e.g. copied request headers) into a dict.

    Each line is split at its first ':'; the name and the value are stripped
    of surrounding whitespace. Lines without a ':' are ignored. When a name
    occurs more than once, the last value wins.

    Args:
        formstr: Text with one ``name: value`` pair per line.

    Returns:
        Dict mapping each name to its value.
    """
    form = {}
    # Split on '\n' rather than requiring a trailing newline, so the final
    # line is kept even when the text does not end with a line break.
    for line in formstr.split('\n'):
        key, sep, value = line.partition(':')
        if sep:
            form[key.strip()] = value.strip()
    return form

时间格式转换为秒:

def miao(video_length):
    """Convert a ``HH:MM:SS``, ``MM:SS`` or ``SS`` string to seconds.

    Args:
        video_length: Colon-separated duration string.

    Returns:
        Total number of seconds as an int. For input with more than three
        colon-separated fields the empty string '' is returned, as before.

    Raises:
        ValueError: If a field is not a valid integer.
    """
    parts = video_length.split(':')
    if len(parts) > 3:
        # Unsupported layout: keep the historical '' result.
        return ''
    total = 0
    # Fold left to right: each further field is worth 1/60 of the previous
    # one, which also covers a bare seconds value with no ':' at all.
    for part in parts:
        total = total * 60 + int(part)
    return total

Mysql数据库查重语句,按表中两个重复字段查重:

-- Template query: select rows of `table` that pass all four filters below.
-- NOTE(review): `table`, 字段1 and 字段2 look like placeholders to be replaced
-- with the real table and column names before running — confirm.
SELECT * FROM table WHERE
    -- 1) keep only rows whose tort_url does not contain '123456'
    tort_url IN ( SELECT tort_url FROM ( SELECT tort_url FROM table WHERE tort_url NOT LIKE '%123456%' ) AS tmp1 ) 
    -- 2) drop rows whose detection_time also appears on a row with is_check set
    AND detection_time not IN ( 
SELECT detection_time FROM ( SELECT detection_time FROM table WHERE is_check IS NOT NULL) AS tmp12 ) 
    -- 3) drop rows whose tort_title comes from a (字段1, 字段2) group of size 1
    -- NOTE(review): tort_title is selected without being grouped; this may be
    -- rejected when ONLY_FULL_GROUP_BY is enabled — verify.
    AND tort_title NOT IN ( 
SELECT tort_title FROM ( SELECT tort_title FROM table GROUP BY 字段1, 字段2 HAVING COUNT( * ) = 1 ) AS tmp2)
    -- 4) drop rows whose tort_putdate comes from a (tort_putdate, tort_title)
    --    group of size 1
    AND tort_putdate NOT IN (
SELECT tort_putdate FROM ( SELECT tort_putdate FROM table GROUP BY tort_putdate, tort_title HAVING COUNT( * ) = 1 ) AS tmp3);

Mysql数据库连接语句:

import re
import time
import pymysql

class Mysql_:
    """Small pymysql helper that opens a new connection for each statement.

    Every public method connects, runs one statement, and closes the cursor
    and connection again. Errors raised while executing a statement are
    printed and rolled back instead of being propagated; errors raised while
    connecting are propagated to the caller.
    """

    def __init__(self, database, root, password, localhost='localhost',
                 port=3306, charset='utf8'):
        """Store the connection settings; no connection is opened here.

        Args:
            database: Name of the database to use.
            root: User name to log in with.
            password: Password for that user.
            localhost: Server host name or address.
            port: Server TCP port.
            charset: Connection character set.
        """
        self.database = database
        self.root = root
        self.password = password
        self.localhost = localhost
        self.port = port
        self.charset = charset
        self.db = None
        self.cur = None

    def __connect(self):
        """Open a connection and a cursor, stored on ``self.db``/``self.cur``."""
        self.db = pymysql.connect(
            host=self.localhost,
            port=self.port,
            user=self.root,
            password=self.password,
            database=self.database,
            charset=self.charset
        )
        self.cur = self.db.cursor()

    def __close(self):
        """Close the current cursor and connection."""
        self.cur.close()
        self.db.close()

    def add_del_upd(self, sql, params):
        """Run one parameterized write statement and commit it.

        Args:
            sql: Statement with placeholders.
            params: Placeholder values passed to ``cursor.execute``.

        On failure the transaction is rolled back and the error is printed.
        """
        self.__connect()
        try:
            self.cur.execute(sql, params)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print(e)
        finally:
            self.__close()

    def add_del_upd_s(self, sql, params):
        """Run one write statement for many parameter sets and commit.

        Args:
            sql: Statement with placeholders.
            params: Sequence of parameter sets for ``cursor.executemany``.

        Returns:
            The value reported by ``executemany`` (affected row count), or 0
            when nothing was reported or the statement failed.
        """
        self.__connect()
        num = 0
        try:
            num = self.cur.executemany(sql, params) or 0
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            # The work was rolled back, so report nothing as affected.
            num = 0
            print(e)
        finally:
            self.__close()
        # Returned after (not inside) ``finally`` so that non-Exception
        # errors such as KeyboardInterrupt are not silently discarded.
        return num

    def select_all(self, sql, **params):
        """Run a query and return every row.

        Args:
            sql: Query text; named placeholders are filled from ``params``.
            **params: Values passed to ``cursor.execute`` as a mapping.

        Returns:
            The result of ``cursor.fetchall()``, or None if the query failed.
        """
        self.__connect()
        try:
            self.cur.execute(sql, params)
            return self.cur.fetchall()
        except Exception as e:
            self.db.rollback()
            print(e)
            return None
        finally:
            # Always release the connection; it used to stay open here.
            self.__close()

    def select_one(self, sql, **params):
        """Run a query and return its first row.

        Args:
            sql: Query text; named placeholders are filled from ``params``.
            **params: Values passed to ``cursor.execute`` as a mapping.

        Returns:
            The result of ``cursor.fetchone()``, or None if the query failed.
        """
        self.__connect()
        try:
            self.cur.execute(sql, params)
            return self.cur.fetchone()
        except Exception as e:
            self.db.rollback()
            print(e)
            return None
        finally:
            # Always release the connection; it used to stay open here.
            self.__close()

    def add_del_s(self, sql):
        """Run one write statement without parameters and commit it.

        Args:
            sql: Complete statement text.

        Returns:
            The value reported by ``execute`` (affected row count), or 0 when
            nothing was reported or the statement failed.
        """
        self.__connect()
        num = 0
        try:
            num = self.cur.execute(sql) or 0
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            # The work was rolled back, so report nothing as affected.
            num = 0
            print(e)
        finally:
            self.__close()
        return num


if __name__ == '__main__':
    # Usage demo; needs a reachable MySQL server and real table names.
    # Create the helper with database name, user, password and host.
    mysql = Mysql_("database", 'user', '123456', '127.0.0.1')

    sql = "SELECT * FROM table_name"      # query statement
    mysql.select_all(sql)      # fetch every matching row
    mysql.select_one(sql)      # fetch only the first matching row

    sql = "DELETE FROM table_name WHERE xxx"      # delete statement
    mysql.add_del_s(sql)     # delete/update all matching rows

    sql = "INSERT INTO table (字段名1,字段名2) values (%s,%s)"
    # Values for the two %s placeholders; may be a tuple or a list.
    parmes = ('value1', 'value2')
    mysql.add_del_upd(sql, parmes)     # insert statement

 

赞(0) 打赏
未经允许不得转载:USharing » Python爬虫常用工具

觉得文章有用就打赏一下文章作者

微信扫一扫打赏