共计 1902 个字符,预计需要花费 5 分钟才能阅读完成。
download:Hadoop 系统入门 + 核心精讲
package com.zzger.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import com.zzger.module.queue.UrlQueue;
import com.zzger.util.HttpUtils;
import com.zzger.util.RegexUtils;
public class WebSite {
/**
* 站点 url
*/
private String url;
/**
* 需要爬行的 url 队列
*/
private UrlQueue urls = new UrlQueue<>();
/**
* 已爬行过的页面 url
*/
private List exitUrls = Collections.synchronizedList(new ArrayList<>());
private static final int TOTAL_THREADS = 12;
private final CountDownLatch mStartSignal = new CountDownLatch(1);
private final CountDownLatch mDoneSignal = new CountDownLatch(TOTAL_THREADS);
public WebSite(String url){
this.url = url;
urls.offer(url);// 把网站首页加入需要爬行的队列中
}
public void guangDu(){new Thread(new Runnable() {
@Override
public void run() {paxing(HttpUtils.httpGet(url));
}
}).start();}
public void paxing(String html){if(html.lastIndexOf(" 下一页
“)<0) return ;
String strList = html.substring(html.indexOf("<li class=\\"next-page\\">"),
html.lastIndexOf(" 下一页
“));
String url = RegexUtils.RegexString("<a href=\\"(.+?)\\"", strList);
if(url.equals("Nothing")) return ;
urls.put(url);// 把 url 存储到队列中
paxing(HttpUtils.httpGet(url));
}
public void dxcPx(){Page page = new Gxpage(urls.take());
List<Section> list = page.ybhqSection().getSections();
for(Section section : list){new Thread(new Runnable() {
@Override
public void run() {mStartSignal.countDown();// 计数减一为 0,工作线程真正启动具体操作
try {mStartSignal.await();// 阻塞,等待 mStartSignal 计数为 0 运行前面的代码
// 所有的工作线程都在等待同一个启动的命令
} catch (InterruptedException e) {e.printStackTrace();
}
DuanZi duanzi = section.select().getModel();
System.out.println(duanzi.getTitle());
mDoneSignal.countDown();// 实现当前计数减一}
}
).start();}
try
{mDoneSignal.await();// 等待所有工作线程结束
}
catch (InterruptedException e)
{e.printStackTrace();
}
dxcPx();// 线程工作执行完后,再次获取 url 队列进行工作}
public static void main(String[] args) {WebSite web = new WebSite("http://duanziwang.com");
web.guangDu();
for(int i = 0; i<10;i++){new Thread(new Runnable() {
@Override
public void run() {web.dxcPx();
}
}).start();}
}
}
正文完