2007-10-12
头文字D,Spider
关键字: 爬虫
贴一段我用D写的Spider的代码。
越来越爱D了。
import std.stdio;
import std.string;
import std.conv;
import std.socket;
import std.socketstream;
import std.stream;
import std.regexp;
import std.thread;
import std.c.time;
// Shared work queue of URLs waiting to be fetched; main() seeds it and
// crawl() both consumes from the front and appends newly found links.
char[][] g_queue;
// Count of URLs that have been enqueued but not yet finished (successfully
// or not). main() polls this to decide when the program may exit.
int g_task_amount=0;
// Start page of the crawl; also used as the prefix for the relative
// "files/list_N.html" links that crawl() extracts.
const char[] homepage="http://mobile.younet.com/";
// Number of worker threads main() starts.
const ushort max_thread=20;
// Short alias so that `find` on strings is unambiguous at the call sites.
alias std.string.find strfind;
/**
 * Fetches a page over plain HTTP/1.0 and returns the response body.
 *
 * The URL is split into host, optional port (default 80) and path, a GET
 * request is sent, and the whole response is read until the peer closes the
 * connection. If the response contains a "Location: " header or a
 * "URL=" meta-refresh target, that target is fetched instead.
 *
 * Params:
 *   url           = absolute URL to fetch.
 *   max_redirects = how many redirects may still be followed; guards against
 *                   redirect loops, which previously recursed without bound.
 *
 * Returns: the response body (everything after the first blank line), the
 *          raw response if it contains no header/body separator, or null
 *          when isURL() rejects the input.
 *
 * Throws: Exception when the redirect limit is exhausted; socket and
 *         conversion errors from the library calls propagate unchanged.
 */
char[] getHTML(char[] url,int max_redirects=5){
    char[] domain,html;
    ubyte[1024] buf;
    ushort port=80;
    if(!isURL(url))return null;

    // Strip the scheme only when one is really present; find() returns -1
    // otherwise and blindly adding 3 would chop two characters off the host.
    int scheme_pos=strfind(url,"://");
    if(scheme_pos>=0){
        url=url[scheme_pos+3..$];
    }

    int e=strfind(url,"/");
    if(e<0){
        e=url.length;
    }
    // Look for the port separator inside the authority part only. A ':'
    // later in the path must not be mistaken for one (it used to produce a
    // reversed slice url[j+1..e]).
    int j=strfind(url[0..e],":");
    if(j>0){
        port=toUshort(url[j+1..e]);
        domain=url[0..j];
    }
    else{
        domain=url[0..e];
    }
    if(e==url.length){
        url="/";
    }
    else{
        url=url[e..$];
    }
    debug(younet){
        writefln(toString(port) ~" "~ domain ~" "~ url);
    }

    Socket sock=new TcpSocket(new InternetAddress(domain,port));
    Stream ss=new SocketStream(sock);
    try{
        // A single empty line terminates the header section.
        ss.writeString("GET " ~ url ~ " HTTP/1.0\r\n"
            "Host: " ~ domain ~ "\r\n"
            "Connection: close\r\n"
            "Referer: http://" ~ domain ~ url ~ "\r\n"
            "\r\n");
        int recv_amount=ss.read(buf);
        while(recv_amount>0){
            html ~= cast(char[])buf[0..recv_amount];
            recv_amount=ss.read(buf);
        }
    }
    finally{
        // Release the connection even when writing or reading throws.
        ss.close();
        sock.close();
    }

    // Follow HTTP redirects and meta-refresh targets, with a depth limit.
    char[][] mc=RegExp("(URL=|Location: )(.*?)[\"\r]").match(html);
    if(mc.length==3){
        if(max_redirects<=0)
            throw new Exception("too many redirects: " ~ mc[2]);
        return getHTML(mc[2],max_redirects-1);
    }

    // Without a header/body separator there is nothing to strip; slicing at
    // (-1)+4 would silently drop data or fail on short responses.
    int start_pos=strfind(html,"\r\n\r\n");
    if(start_pos<0)return html;
    return html[start_pos+4 .. $];
}
// One lock for every access crawl() makes to g_queue and g_task_amount.
// A bare `synchronized{}` statement gets its own private mutex, so separate
// bare statements do not exclude one another; a shared object is required.
private Object g_crawl_lock;
static this(){
    g_crawl_lock=new Object;
}

/**
 * Worker thread body: repeatedly takes a URL from g_queue, downloads it with
 * getHTML() and, for pages that are not themselves "files/list" pages,
 * enqueues every "files/list_N.html" link found (prefixed with homepage).
 *
 * g_task_amount is decremented once for every URL taken from the queue,
 * whether the download succeeded or failed. The worker only exits when the
 * queue is empty and no task is pending anywhere, so it cannot quit while
 * another worker is still about to enqueue links.
 *
 * Params:
 *   p = unused thread argument.
 *
 * Returns: always 1.
 */
int crawl(void * p){
    while(true){
        char[] url,html;
        bool have_url=false;
        bool finished=false;
        synchronized(g_crawl_lock){
            if(g_queue.length>0){
                url=g_queue[0];
                g_queue=g_queue[1..$];
                have_url=true;
            }
            else if(g_task_amount<=0){
                finished=true;
            }
        }
        if(finished)
            break;
        if(!have_url){
            // Queue is empty but pages are still in flight: wait without
            // holding the lock so other workers are not stalled.
            sleep(1);
            continue;
        }
        // URLs are passed as arguments, never as part of the format string,
        // so a '%' inside a URL cannot corrupt the output.
        writefln("begin:%s",url);
        try{
            html=getHTML(url);
        }
        catch(Exception ex){
            int left;
            synchronized(g_crawl_lock){
                if(g_task_amount>0)
                    g_task_amount-=1;
                left=g_task_amount;
            }
            writefln(ex);
            writefln("failed:%s",url);
            writefln("remains%d",left);
            continue;
        }
        debug(younet){
            // The page may be shorter than 200 characters (or null).
            int preview=html.length<200 ? html.length : 200;
            writefln("!!!%s",html[0..preview]);
        }
        if(strfind(url,"files/list")<0){
            // Extract the links before taking the lock to keep the critical
            // section short.
            char[][] found;
            foreach(m;RegExp("files/list_\\d+\\.html").search(html)){
                found ~= homepage ~ m.match(0);
            }
            synchronized(g_crawl_lock){
                foreach(link;found){
                    g_queue ~= link;
                    g_task_amount+=1;
                }
                // Count this page as finished only after its links are
                // queued, so the pending count never drops to zero early.
                g_task_amount-=1;
            }
            writefln("done:%s",url);
            debug(younet){
                writefln(g_queue);
            }
        }
        else{
            int left;
            writefln("done:%s",url);
            synchronized(g_crawl_lock){
                g_task_amount-=1;
                left=g_task_amount;
            }
            writefln("remains%d",left);
        }
    }
    return 1;
}
/**
 * Program entry point: seeds the queue with the home page, starts
 * max_thread crawler threads and then polls once per second until no task
 * is pending.
 *
 * Params:
 *   args = command-line arguments (unused).
 *
 * Returns: 0.
 */
int main(char[][] args){
    // The start page counts as the first pending task.
    g_task_amount+=1;
    g_queue ~= homepage;

    Thread[] tds;
    int started=0;
    while(started<max_thread){
        Thread worker=new Thread(&crawl,null);
        worker.start();
        tds ~= worker;
        started++;
    }

    // Initial grace period, then poll the pending-task counter.
    sleep(5);
    do{
        sleep(1);
    }while(g_task_amount>0);
    return 0;
}
发表评论
提醒: 该博客已发表在公共论坛,博客所有留言会成为论坛回贴,留言请注意遵守论坛发贴规则
- 浏览: 1693 次
- 性别:

- 来自: 北京

- 详细资料
搜索本博客
最近加入圈子
链接
最新评论
-
LockMode.UPGRADE 引发的 ...
所谓的LockMode.UPGRADE就是oracle里面的select .. ...
-- by Saro -
LockMode.UPGRADE 引发的 ...
请问当Lock为Write时抛出什么异常?
-- by popduke -
LockMode.UPGRADE 引发的 ...
wang19841229 写道这个地方为什么不能使用乐观锁,我觉得也是可以的。 ...
-- by fxsjy -
LockMode.UPGRADE 引发的 ...
这个地方为什么不能使用乐观锁,我觉得也是可以的。
-- by wang19841229 -
LockMode.UPGRADE 引发的 ...
对于web应用程序运用悲观锁之后,何时释放锁怎么释放锁是个难题。
-- by oldrock






评论排行榜