手机
当前位置:查字典教程网 > 脚本专栏 > Python > Python 读取 HTML 中指定元素生成 Excel 文件示例
Python 读取 HTML 中指定元素生成 Excel 文件示例
摘要:使用 Python 2.7 编写的脚本,读取 HTML 中的指定元素,并生成 Excel 文件。复制代码 代码如下:#coding=gbk import string impo...

下面的脚本使用 Python 2.7 编写,用于读取 HTML 文件中的指定元素,并生成 Excel 文件。

复制代码 代码如下:

#coding=gbk

import string

import codecs

import os,time

import xlwt

import xlrd

from bs4 import BeautifulSoup

from xlrd import open_workbook

class LogMsg:
    """Small helper that logs to a file through the root logger.

    Creating an instance attaches a ``logging.FileHandler`` for *logfile* to
    the root logger and sets the root logger's level.  ``output`` emits a
    message at the logger's current effective level, and ``close`` detaches
    and closes the file handler.

    Any failure prints a short message and terminates the process with exit
    status 1.
    """

    # Numeric values of the standard levels DEBUG, INFO, WARNING, ERROR and
    # CRITICAL; any other value is treated as "not set".
    _KNOWN_LEVELS = (10, 20, 30, 40, 50)

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* to the root logger.

        Args:
            logfile: Path of the log file to append to.
            Level: One of 10, 20, 30, 40 or 50 (the standard logging
                levels).  Any other value selects ``logging.NOTSET``.

        Raises:
            SystemExit: With status 1 if the logger cannot be set up.
        """
        try:
            # Imported here so that a broken logging setup is reported the
            # same way as any other initialisation failure.
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if Level in self._KNOWN_LEVELS:
                self.logger.setLevel(int(Level))
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's current effective level.

        When the effective level is not one of the five standard levels the
        message is logged at INFO.

        Raises:
            SystemExit: With status 1 if logging the message fails.
        """
        level = self.logger.getEffectiveLevel()
        if level not in self._KNOWN_LEVELS:
            level = 20  # INFO
        try:
            self.logger.log(level, logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Raises:
            SystemExit: With status 1 if the handler cannot be released.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() only detaches the handler; the underlying file
            # stays open until the handler itself is closed.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)

# Module-level configuration, evaluated once when the file is loaded.

# Second-resolution local timestamp; used in the generated .xls file name.
Logtime = time.strftime("%Y%m%d%H%M%S")

# Day-resolution local timestamp; used in the log file name.
logFileTime = time.strftime("%Y%m%d")

# Log file path for this run's date.
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % (logFileTime,)

# Shared logger for the script, created at level 20.
log = LogMsg(Logfile, 20)

# Directory that is scanned for .html input and receives the .xls output.
DATAPATH = '/data/pyExample/'

# Name of the workbook written by this run.
XLSname = ''.join(('dangjian_', Logtime, '.xls'))

def duration_to_seconds(text):
    """Convert a duration expressed in minutes to a string of whole seconds.

    Only the ASCII digits found in *text* are used; every other character is
    ignored.

    Args:
        text: String containing the number of minutes.

    Returns:
        The duration in seconds, formatted as a decimal string.

    Raises:
        ValueError: If *text* contains no ASCII digit.
    """
    digits = ''.join(ch for ch in text if ch in '0123456789')
    if not digits:
        raise ValueError('no digits in duration: %r' % (text,))
    return '%i' % (int(digits) * 60)


def build_xls_row(cells):
    """Pick the cells used by the import template out of *cells*.

    The positions are fixed: cell 6 fills both the folder-name and keyword
    columns, cell 4 the content name, cell 16 the duration in minutes,
    cell 36 the description and cell 10 the first sub-content name.

    Args:
        cells: Sequence of the <td> texts collected from one HTML file.

    Returns:
        A list with the 11 column values of one worksheet row.

    Raises:
        IndexError: If *cells* has fewer than 37 entries.
        ValueError: If the duration cell contains no digits.
    """
    folder_name = cells[6]
    content_name = cells[4]
    seconds = duration_to_seconds(cells[16])
    key_word = cells[6]
    description = cells[36]
    video_name = cells[10]
    return ['', folder_name, '', content_name, seconds, key_word,
            description, '管理员', '华数编辑', video_name, '']


# NOTE(review): the published listing lost its indentation, so the nesting
# below is reconstructed.  It assumes the workbook is saved once, after all
# files have been processed - confirm against the original script.
if __name__ == '__main__':
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')

    # Row 0 holds the column titles of the import template.
    headers = ('内容类型 ', '栏目名称', '栏目编号', '内容名称', '时长',
               '关键字', '看点', '作者', '来源', '子内容1', '子内容2')
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue

        log.output('当前文件:' + f)
        # The with-block guarantees the file is closed even if decoding fails.
        with codecs.open(os.path.join(DATAPATH, f), 'r', 'gbk') as htmlFile:
            lines = htmlFile.readlines()
        if not lines:
            log.output('not line')

        # Collect the text of every <td>, parsing the file line by line.
        content = []
        for line in lines:
            if not line.strip():
                log.output('该处是空行')
                continue
            # NOTE(review): the listing called line.replace('', '') here,
            # which is a no-op and was dropped.  Its first argument was
            # probably lost when the article was published - restore it if
            # the intended text is known.
            soup = BeautifulSoup(line)
            for tdd in soup.findAll('td'):
                content.append(tdd.text.encode("gbk"))

        # Dump every cell with its position.  enumerate() reports the real
        # position even when several cells hold the same text.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        try:
            row = build_xls_row(content)
        except (IndexError, ValueError) as err:
            # One malformed file must not abort the run and lose the rows
            # already collected; record it and move on.
            log.output('skip %s: %s' % (f, err))
            continue

        # Echo the six values taken from the HTML file.
        for col in (1, 3, 4, 5, 6, 9):
            print(row[col])
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)

        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(os.path.join(DATAPATH, XLSname))
    print('=========================================')

【Python 读取 HTML 中指定元素生成 Excel 文件示例】相关文章:

使用python装饰器验证配置文件示例

使用python统计文件行数示例分享

python读文件逐行处理的示例代码分享

python将xml xsl文件生成html文件存储示例讲解

Python pass 语句使用示例

python调用cmd复制文件代码分享

python实现图片批量剪切示例

python读取csv文件示例(python操作csv)

python使用win32com在百度空间插入html元素示例

python定时器使用示例分享

精品推荐
分类导航