手机
当前位置:查字典教程网 >编程开发 >ASP教程 >自己做采集程序
自己做采集程序
摘要:现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要...

现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。

首先去下载个XMLHTTP的类文件:

<%

Class xhttp

private cset,sUrl,sError

Private Sub Class_Initialize()

'cset="UTF-8"

cset="GB2312"

sError=""

end sub

Private Sub Class_Terminate()

End Sub

Public Property LET URL(theurl)

sUrl=theurl

end property

public property GET BasePath()

BasePath=mid(sUrl,1,InStrRev(sUrl,"/")-1)

end property

public property GET FileName()

FileName=mid(sUrl,InStrRev(sUrl,"/")+1)

end property

public property GET Html()

Html=BytesToBstr(getBody(sUrl))

end property

public property GET xhttpError()

xhttpError=sError

end property

private Function BytesToBstr(body)

on error resume next

'Cset:GB2312 UTF-8

dim objstream

set objstream = Server.CreateObject("adodb.stream")

with objstream

.Type = 1 '

.Mode = 3 '

.Open

.Write body '

.Position = 0 '

.Type = 2 '

.Charset = Cset '

BytesToBstr = .ReadText '

.Close

end with

set objstream = nothing

End Function

private function getBody(surl)

on error resume next

dim xmlHttp

'Set xmlHttp=server.createobject("Msxml2.XMLHTTP.4.0")

'set xmlHttp=server.createobject("Microsoft.XMLHTTP")

set xmlHttp=server.createobject("MSXML2.ServerXMLHTTP")

xmlHttp.setTimeouts 10000,10000,10000,30000

xmlHttp.open "GET",surl,false

xmlHttp.send

if xmlHttp.readystate=4 then

'if xmlHttp.status=200 then

getBody=xmlhttp.responsebody

'end if

else

getBody=""

end if

if Err.Number<>0 then

sError=Err.Number

Err.clear

else

sError=""

end if

set xmlHttp=nothing

end function

Public function saveimage(tofile,isoverwrite)

on error resume next

dim objStream,objFSO,imgs

if Not isoverwrite Then

Set objFSO = Server.CreateObject("Scripting.FileSystemObject")

If objFSO.FileExists(Server.MapPath(tofile)) Then

Exit Function

End If

Set objFSO = Nothing

End IF

imgs=getBody(sUrl)

Set objStream = Server.CreateObject("ADODB.Stream")

with objStream

.Type =1

.Open

.write imgs

.SaveToFile server.mappath(tofile),2

.Close()

end with

set objstream=nothing

end function

end class

%>

用了这个类文件,做起事情来就方便多了。

然后就可以分析采集网站的网页结构,写采集程序了。

下面给个例子:

<>

<>

<>

<%

server.ScriptTimeout = 1000

%>

<html>

<head>

<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />

<title>BT采集器</title>

</head>

<body>

<form name="form1" method="post" action="get81bt.asp">

分类ID:

<input type="text" name="cid" value="<%=request("cid")%>"><br>

开始ID:

<input type="text" name="startid" value="<%=request("startid")%>">

<br>

结束ID:

<input type="text" name="overid" value="<%=request("overid")%>">

<br>

分类名称:<input type="text" name="classname" value="<%=request("classname")%>">为空自动获取

<br>

<input name="action" type="hidden" id="action" value="getdata">

<input type="submit" name="Submit" value="采集">

</form>

当前ID:<%=request("id")%> <br>

<%

dim action

action = Request("action")

if action = "getdata" then

cid = Request("cid")

startid = Request("startid")

overid = Request("overid")

id = Request("id")

if id = "" then id = startid

set objxhttp = new xhttp

objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm"

content = objxhttp.Html

if InStr(content,"网站维护中") then

call NextID

response.End()

end if

list = GetContent(content,"<>","<>",0)

Dim regEx, Match, Matches,patrn

Set regEx = New RegExp

patrn = "<a href=""../BtHtml/(.+?)"">"

regEx.Pattern = patrn

regEx.IgnoreCase = True

regEx.Global = True

Set Matches = regEx.Execute(list)

on error resume next

For Each Match in Matches

'response.write Match.Value & "<br>"

weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1")

response.write weburl & "<br>"

response.Flush()

objxhttp.URL = weburl

cpage = objxhttp.Html

cpage = GetContent(cpage,"<>","<>",0)

title = GetContent(cpage,"BT资源名称:<strong>","</strong>",0)

title = stripHTML(title)

IF Request("classname") <> "" then

classname = Request("classname")

Else

if InStr(title,"喜剧") then

classname = "喜剧"

Elseif InStr(title,"动作") then

classname = "动作"

Elseif InStr(title,"惊悚") then

classname = "惊悚"

Elseif InStr(title,"犯罪") then

classname = "犯罪"

Elseif InStr(title,"恐怖") then

classname = "恐怖"

Elseif InStr(title,"爱情") then

classname = "爱情"

Elseif InStr(title,"冒险") then

classname = "冒险"

Elseif InStr(title,"科幻") then

classname = "科幻"

Elseif InStr(title,"悬念") then

classname = "悬念"

Elseif InStr(title,"奇幻") then

classname = "奇幻"

Elseif InStr(title,"战争") then

classname = "战争"

Elseif InStr(title,"连续剧") then

classname = "连续剧"

Elseif InStr(title,"综艺") then

classname = "综艺"

Elseif InStr(title,"灾难") then

classname = "灾难"

Elseif InStr(title,"伦理") then

classname = "伦理"

Elseif InStr(title,"动漫") or InStr(title,"动画") then

classname = "动漫"

Elseif InStr(title,"国语") or InStr(title,"集") then

classname = "其他影视"

Else

classname = "其他"

End if

End IF

intro = GetContent(cpage,"<tr><td width=770 bgcolor=#FFFFFF><divmargin:10px;line-height:150%"">","</div>",0)

intro = Replace(intro,"<br />","[br]")

intro = Replace(intro,"<BR />","[br]")

intro = Replace(intro,"<BR>","[br]")

intro = Replace(intro,"<br>","[br]")

intro = Replace(intro,"<p>","[p]")

intro = Replace(intro,"<P>","[p]")

intro = Replace(intro,"</p>","[/p]")

intro = Replace(intro,"</P>","[p]")

intro = Replace(intro,"<img","[img")

intro = Replace(intro,"<IMG","[img")

intro = stripHTML(intro)

intro = Replace(intro,"[br]","<br>")

intro = Replace(intro,"[p]","<p>")

intro = Replace(intro,"[/p]","</p>")

intro = Replace(intro,"[img","<img")

intro = Replace(intro,"[img]","<img src=")

intro = Replace(intro,"[/img]",">")

intro = Replace(intro,"[IMG]","<img src=")

intro = Replace(intro,"[/IMG]",">")

'response.write t

'response.End()

addtime = Trim(GetContent(cpage,"发布时间:","",0))

if Not IsDate(addtime) then addtime = now()

username = "bt"

filesize = GetContent(content,"BT文件大小:","",0)

title2 = title

downurl = GetContent(cpage,"<acolor:red"" href=""","""",0)

p = CDate(addtime)

Dim sRnd

Randomize

sRnd = Int(900 * Rnd) + 100

sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & ".torrent"

url = "torrent/" & year(p) & "-" & month(p) & "-" & day(p) & "/" & sFileName

Call CreateF(url)

'Text

Response.Write classname & "<br>"

Response.write title & "<br>"

'response.Write intro & "<br>"

'response.Write addtime & "<br>"

'response.Write username & "<br>"

'response.Write filesize & "<br>"

response.Write downurl & "<br>"

response.Write url & "<br>"

response.Flush()

'response.End()

'database

if err.number = 0 then

if (Not IsNull(title)) and title <> "" and downurl <> "" then

set rs = server.CreateObject("adodb.recordset")

sql = "select * from bt_class where classname = '" & classname & "'"

rs.open sql,conn,1,3

if rs.eof then

rs.addnew

rs("classname") = classname

rs.update

end if

classid = rs("classid")

rs.close

set rs = nothing

set rs = server.CreateObject("adodb.recordset")

sql = "select * from bt_movie where title in ('" & title & "')"

rs.open sql,conn,1,3

if rs.eof then

response.Write "<div><font color=blue>写入数据库...</font></div>"

response.Flush()

rs.addnew

rs("classid") = classid

rs("title") = title

rs("title2") = title2

rs("intro") = intro

rs("username") = username

rs("filesize") = filesize

rs("url") = url

rs("serverid") = 1

rs("addtime") = addtime

rs("ismake") = 0

rs.update

objxhttp.URL = downurl

objxhttp.saveimage url,False

else

response.Write "<div><font color=red>已经存在!</font></div>"

end if

rs.close

set rs = nothing

'objxhttp.URL = downurl

'objxhttp.saveimage url,False

End IF

Else

err.clear

End IF

response.Write "-------------------------------------------<br>"

Next

set regEx = nothing

response.Write "下一页<br>"

response.Flush()

Call NextID()

end if

Sub NextID

conn.close

set conn = nothing

if cint(startid) < cint(overid) and cint(id) < cint(overid) then

response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id + 1 &"'</script>"

Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then

response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id - 1 &"'</script>"

Else

Response.Write "采集完成!<br>"

response.End()

End if

End Sub

%>

</body>

</html>

【自己做采集程序】相关文章:

巧用缓存提高asp程序的性能

在ASP处理程序时显示进度

asp中实现随机分组程序的代码

asp制作的日历程序

access 数据连接程序

ASP万用分页程序

小偷,采集程序常用函数

ASP下实现自动采集程序及入库的代码

经典的分页完整程序+注释

小偷程序2

精品推荐
分类导航