手机
当前位置:查字典教程网 >编程开发 >ASP教程 >写了段批量抓取某个列表页的东东
写了段批量抓取某个列表页的东东
摘要:有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙真是的!可能偶下边这段东西比较烂哈下边这个没有写入库功能,已经到这一步...

有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙 真是的!可能偶下边这段东西比较烂哈

下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果

Dim Url,List_PageCode,Array_ArticleID,i,ArticleID

Dim Content_PageCode,Content_TempCode

Dim Content_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName

Dim ArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent

Url = "http://www.webasp.net/article/class/1.htm"

List_PageCode = getHTTPPage(Url)

List_PageCode = RegExpText(List_PageCode,"打印</th></tr>","</table><table border=0 cellpadding=5",0)

List_PageCode = RegExpText(List_PageCode,"<td align=left><a href='../","'><img border=0 src='../images/authortype0.gif'",1) '取得当前列表页的文章链接,以,分隔

Array_ArticleID = Split(List_PageCode,",") '创建数组,存储文章ID

For i=0 To Ubound(Array_ArticleID)-1

ArticleID = Array_ArticleID(i) '文章ID

Content_PageCode = getHTTPPage("http://www.webasp.net/article/"&ArticleID) '取得文章页的内容

'=========取文章分类及相关ID参数 开始=======================

Content_TempCode = RegExpText(Content_PageCode,"<a href=""/article/"">技术教程</a> >> ",">> 内容</td>",0)

Content_CategoryID = RegExpText(Content_PageCode,"<a href='../class","/'>",1)

BorderID = Split(Content_CategoryID,",")(0) '大类ID

ClassID = Split(Content_CategoryID,",")(1) '子类ID

'==========检查大类是否存在 开始===============

'如果不存在则入库

'==========检查大类是否存在 结束===============

'Response.Write(BorderID & "," & ClassID & "<br />")

Content_CategoryName = RegExpText(Content_PageCode,"/'>","</a>",1)

BorderName = Split(Content_CategoryName,",")(0) '大类名称

ClassName = Split(Content_CategoryName,",")(1) '子类名称

'==========检查子类是否存在 开始===============

'如果不存在则入库

'==========检查子类是否存在 结束===============

'=========取文章分类及相关ID参数 结束=======================

'=========取文章标题及内容 开始=============================

ArticleTitle = RegExpText(Content_PageCode,"<tr><td align=center bgcolor=#DEE2F5><strong>","</strong></td></tr>",0)

ArticleAuthor = RegExpText(Content_PageCode,"<tr><td><span class=blue>作者:</span>","</td></tr>",0)

ArticleFrom = RegExpText(Content_PageCode,"<tr><td><span class=blue>来源:</span>","</td></tr>",0)

ArticleContent = RegExpText(Content_PageCode,"<tr><td class=contentWORD-WRAP: break-word"" id=zoom>","</td></tr>"&VBCrlf&" </table>"&VBCrlf&" </td></tr></table>",0)

'=========取文章标题及内容 结束=============================

Response.Write(ArticleTitle& "<br /><br />")

Response.Flush()

Next

附几个函数:

Function getHTTPPage(url)

IF(IsObjInstalled("Microsoft.XMLHTTP") = False)THEN

Response.Write "<br><br>服务器不支持Microsoft.XMLHTTP组件"

Err.Clear

Response.End

END IF

On Error Resume Next

Dim http

SET http=Server.CreateObject("Msxml2.XMLHTTP")

Http.open "GET",url,False

Http.send()

IF(Http.readystate<>4)THEN

Exit Function

END IF

getHTTPPage=BytesToBSTR(Http.responseBody,"GB2312")

SET http=NOTHING

IF(Err.number<>0)THEN

Response.Write "<br><br>获取文件内容出错"

'Response.End

Err.Clear

END IF

End Function

Function BytesToBstr(CodeBody,CodeSet)

Dim objStream

SET objStream = Server.CreateObject("adodb.stream")

objStream.Type = 1

objStream.Mode =3

objStream.Open

objStream.Write CodeBody

objStream.Position = 0

objStream.Type = 2

objStream.Charset = CodeSet

BytesToBstr = objStream.ReadText

objStream.Close

SET objStream = NOTHING

End Function

'================================================

'作 用:检查组件是否已经安装

'返回值:True ----已经安装

' False ----没有安装

'================================================

Function IsObjInstalled(objName)

On Error Resume Next

IsObjInstalled = False

Err = 0

Dim testObj

SET testObj = Server.CreateObject(objName)

IF(0 = Err)THEN IsObjInstalled = True

SET testObj = NOTHING

Err = 0

End Function

Function RegExpText(strng,strStart,strEnd,n)

Dim regEx,Match,Matches,RetStr

SET regEx = New RegExp

regEx.Pattern = strStart&"([sS]*?)"&strEnd

regEx.IgnoreCase = True

regEx.Global = True

SET Matches = regEx.Execute(strng)

For Each Match in Matches

IF(n=1)THEN

RetStr = RetStr & regEx.Replace(Match.Value,"$1") & ","

ELSE

RetStr = RetStr & regEx.Replace(Match.Value,"$1")

END IF

Next

RegExpText = RetStr

SET regEx=NOTHING

End Function

【写了段批量抓取某个列表页的东东】相关文章:

ASP.NET实现单点登陆(SSO)适用于哪些情况?

实例讲解实现抓取网上房产信息的ASP程序

ASP中通过该日历算法实现的具体代码

方便的大家admin及admin888 经过 md5加密后16位和32位代码

ASP编程入门进阶(七):内置对象Server

信息发布中的判断过期和有效期的东西

取得表单提交的所有数据

在不刷新页面的情况下调用远程asp脚本

记录集内随机取记录的代码

ASP代码实现access随机显示不重复记录

上一篇: 先锋海盗类
精品推荐
分类导航