目标
爬取搜狗图片上千张美女图片并下载到本地
准备工作
爬取地址:https://pic.sogou.com/pics?qu…
分析
打开上面的地址,按 F12 打开开发者工具 – Network – XHR,页面往下滑动,XHR 栏出现如下请求信息:
Request URL:https://pic.sogou.com/napi/pc…
分析这段请求 URL 的几个主要参数:
start=48 表示从第 48 张图片开始检索
xml_len=48 表示从第 48 张往后获取 48 张图片
query=? 搜索关键词(例:美女,这里浏览器自动做了转码,不影响我们使用)
点击 Response,找个 JSON 格式化工具辅助查看。
JSON 格式化:https://www.bejson.com/
分析 Response 返回的信息,可以发现我们想要的图片地址放在 picUrl 里。
思路
通过以上分析,不难实现下载方法,思路如下:
- 设置 URL 请求参数
- 访问 URL 请求,获取图片地址
- 图片地址存入 List
- 遍历 List,使用线程池下载到本地
代码
SougouImgProcessor.java 爬取图片类
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects picture URLs from the Sogou image-search JSON endpoint and hands them to
 * {@link SougouImgPipeline} for download.
 *
 * <p>Instances are not thread-safe: {@link #process(int, int)} appends to plain
 * {@link ArrayList}s.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SougouImgProcessor {

    /** Format template of the search URL; receives start index, page size and query, in that order. */
    private final String url;
    /** Downloader that stores the collected pictures on disk. */
    private final SougouImgPipeline pipeline;
    /** Raw JSON items of every page processed so far. */
    private final List<JSONObject> dataList;
    /** Value of the {@code picUrl} field of every collected item (may contain {@code null}). */
    private final List<String> urlList;
    /** Search keyword; also handed to the pipeline as the category (sub-directory) name. */
    private final String word;

    /**
     * @param url  format template with three {@code %s} placeholders: start index, page size, query
     * @param word raw (not URL-encoded) search keyword
     */
    public SougouImgProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new SougouImgPipeline();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Fetches one result page and records the picture URLs it contains.
     *
     * <p>A page that cannot be fetched or that lacks {@code data.items} is skipped instead of
     * failing with a {@link NullPointerException}.
     *
     * @param idx  index of the first picture to request
     * @param size number of pictures to request
     */
    public void process(int idx, int size) {
        // The keyword is percent-encoded so that non-ASCII text, spaces or '&' cannot corrupt the query string.
        String res = HttpClientUtils.get(String.format(this.url, idx, size, encodeQuery(this.word)));
        if (res == null || res.isEmpty()) {
            // presumably the HTTP helper returns an empty string when the request failed
            return;
        }
        JSONObject object = JSONObject.parseObject(res);
        JSONObject data = object == null ? null : object.getJSONObject("data");
        List<Object> items = data == null ? null : data.getJSONArray("items");
        if (items == null) {
            return;
        }
        for (Object element : items) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject item = (JSONObject) element;
            this.urlList.add(item.getString("picUrl"));
            this.dataList.add(item);
        }
    }

    /** Downloads every collected picture URL using the multi-threaded pipeline. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    /** Percent-encodes {@code value} as UTF-8 for use inside a URL query string. */
    private static String encodeQuery(String value) {
        try {
            return java.net.URLEncoder.encode(value, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // UTF-8 support is mandatory on every Java platform, so this cannot happen in practice.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        SougouImgProcessor processor = new SougouImgProcessor(url, "美女");
        // start index, pictures per request, total number of pictures to crawl
        int start = 0, size = 50, limit = 1000;
        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }
        processor.pipelineData();
    }
}
SougouImgPipeline.java 图片下载类
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Downloads pictures from a list of URLs into {@code <path>/<category>/}.
 *
 * <p>The success and failure counters are shared by all calls made on the same instance.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Extension used when none can be derived from the picture URL. */
    private final String extension;
    /** Root directory below which one sub-directory per category is created. */
    private String path;
    /** Number of pictures written successfully. */
    private final AtomicInteger suc = new AtomicInteger();
    /** Number of downloads that ended with an exception. */
    private final AtomicInteger fails = new AtomicInteger();

    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    public SougouImgPipeline(String path) {
        this(path, ".jpg");
    }

    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture to {@code <path>/<cate>/<name><ext>} unless that file already exists.
     *
     * @param url  address of the picture
     * @param cate category, used as the sub-directory name
     * @param name file name without extension; any {@code '-'} is removed
     * @throws Exception if the URL is malformed or the transfer fails; a partially written
     *                   file is deleted first so that a later run retries the download
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String path = this.path + "/" + cate + "/";
        File dir = new File(path);
        if (!dir.exists()) { // create the target directory on first use
            dir.mkdirs();
        }
        String fileName = (name + resolveExtension(url)).replace("-", "");
        File img = new File(path + fileName);
        if (img.exists()) { // already downloaded by an earlier run: skip
            System.out.println(String.format("文件 %s 已存在本地目录", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (Exception e) {
            // Both streams are closed here; drop the truncated file so it is not mistaken for a finished one.
            img.delete();
            throw e;
        }
        System.out.println("picUrl:" + url);
        System.out.println(String.format("正在下载第 %s 张图片", suc.incrementAndGet()));
    }

    /**
     * Derives the file extension (including the dot) from the last path segment of {@code url},
     * ignoring query string and fragment; falls back to the configured default extension when
     * the segment has no purely alphanumeric extension.
     */
    private String resolveExtension(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String resource = url.substring(0, end);
        int dot = resource.lastIndexOf('.');
        if (dot > resource.lastIndexOf('/') && dot < resource.length() - 1) {
            boolean alphanumeric = true;
            for (int i = dot + 1; i < resource.length(); i++) {
                if (!Character.isLetterOrDigit(resource.charAt(i))) {
                    alphanumeric = false;
                    break;
                }
            }
            if (alphanumeric) {
                return resource.substring(dot);
            }
        }
        return this.extension;
    }

    /**
     * Downloads the pictures one after another on the calling thread and prints a summary.
     *
     * <p>Each file is named after the hash code of its URL, because the URL itself contains
     * characters that are not valid in a file name. Two URLs with the same hash code would
     * therefore map to the same file.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, "img" + Integer.toHexString(picUrl.hashCode()));
            } catch (Exception e) {
                fails.incrementAndGet();
            }
        }
        System.out.println("下载胜利:" + suc.get());
        System.out.println("下载失败:" + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads every picture on its own pool thread, waits up to 60 seconds for the downloads
     * to finish and prints a summary.
     *
     * <p>Files are named after the zero-padded position of the URL in {@code data}.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category, used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; indexes of 1000 and above keep their own distinct name.
            String finalName = String.format(java.util.Locale.ROOT, "%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    fails.incrementAndGet();
                }
            });
        }
        executorService.shutdown();
        try {
            // The result is deliberately ignored: on timeout the remaining downloads are left
            // running (no shutdownNow), so the figures printed below may still be incomplete.
            executorService.awaitTermination(60, TimeUnit.SECONDS);
            System.out.println("AwaitTermination Finished");
            System.out.println("共有 URL:" + data.size());
            System.out.println("下载胜利:" + suc);
            System.out.println("下载失败:" + fails);
            File dir = new File(this.path + "/" + word + "/");
            String[] names = dir.list(); // null when the directory was never created
            int len = names == null ? 0 : names.length;
            System.out.println("以后共有文件:" + len);
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt for the caller
            e.printStackTrace();
        }
    }

    /**
     * Splits {@code data} into {@code threadNum} consecutive segments and runs
     * {@link #process(List, String)} on each segment in its own pool thread. Returns without
     * waiting for the segments to finish.
     *
     * <p>Falls back to a single {@code process} call on the calling thread when
     * {@code threadNum} is smaller than 1 or larger than the number of URLs.
     *
     * @param data      picture URLs
     * @param word      category, used as the sub-directory name
     * @param threadNum number of segments
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum < 1 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment; the last segment takes the remainder
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}
HttpClientUtils.java HTTP 请求工具类
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @author code4crafter@gmail.com
* Date: 17/3/27
*/
public abstract class HttpClientUtils {public static Map<String, List<String>> convertHeaders(Header[] headers) {Map<String, List<String>> results = new HashMap<String, List<String>>();
for (Header header : headers) {List<String> list = results.get(header.getName());
if (list == null) {list = new ArrayList<String>();
results.put(header.getName(), list);
}
list.add(header.getValue());
}
return results;
}
/**
* http 的 get 申请
* @param url
*/
public static String get(String url) {return get(url, "UTF-8");
}
public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);
/**
* http 的 get 申请
* @param url
*/
public static String get(String url, String charset) {HttpGet httpGet = new HttpGet(url);
return executeRequest(httpGet, charset);
}
/**
* http 的 get 申请,减少异步申请头参数
* @param url
*/
public static String ajaxGet(String url) {return ajaxGet(url, "UTF-8");
}
/**
* http 的 get 申请,减少异步申请头参数
*
* @param url
*/
public static String ajaxGet(String url, String charset) {HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
return executeRequest(httpGet, charset);
}
/**
* @param url
* @return
*/
public static String ajaxGet(CloseableHttpClient httpclient, String url) {HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
return executeRequest(httpclient, httpGet, "UTF-8");
}
/**
* http 的 post 申请,传递 map 格局参数
*/
public static String post(String url, Map<String, String> dataMap) {return post(url, dataMap, "UTF-8");
}
/**
* http 的 post 申请,传递 map 格局参数
*/
public static String post(String url, Map<String, String> dataMap, String charset) {HttpPost httpPost = new HttpPost(url);
try {if (dataMap != null) {List<NameValuePair> nvps = new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
formEntity.setContentEncoding(charset);
httpPost.setEntity(formEntity);
}
} catch (UnsupportedEncodingException e) {e.printStackTrace();
}
return executeRequest(httpPost, charset);
}
/**
* http 的 post 申请,减少异步申请头参数,传递 map 格局参数
*/
public static String ajaxPost(String url, Map<String, String> dataMap) {return ajaxPost(url, dataMap, "UTF-8");
}
/**
* http 的 post 申请,减少异步申请头参数,传递 map 格局参数
*/
public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {HttpPost httpPost = new HttpPost(url);
httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
try {if (dataMap != null) {List<NameValuePair> nvps = new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
formEntity.setContentEncoding(charset);
httpPost.setEntity(formEntity);
}
} catch (UnsupportedEncodingException e) {e.printStackTrace();
}
return executeRequest(httpPost, charset);
}
/**
* http 的 post 申请,减少异步申请头参数,传递 json 格局参数
*/
public static String ajaxPostJson(String url, String jsonString) {return ajaxPostJson(url, jsonString, "UTF-8");
}
/**
* http 的 post 申请,减少异步申请头参数,传递 json 格局参数
*/
public static String ajaxPostJson(String url, String jsonString, String charset) {HttpPost httpPost = new HttpPost(url);
httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
StringEntity stringEntity = new StringEntity(jsonString, charset);// 解决中文乱码问题
stringEntity.setContentEncoding(charset);
stringEntity.setContentType("application/json");
httpPost.setEntity(stringEntity);
return executeRequest(httpPost, charset);
}
/**
* 执行一个 http 申请,传递 HttpGet 或 HttpPost 参数
*/
public static String executeRequest(HttpUriRequest httpRequest) {return executeRequest(httpRequest, "UTF-8");
}
/**
* 执行一个 http 申请,传递 HttpGet 或 HttpPost 参数
*/
public static String executeRequest(HttpUriRequest httpRequest, String charset) {
CloseableHttpClient httpclient;
if ("https".equals(httpRequest.getURI().getScheme())) {httpclient = createSSLInsecureClient();
} else {httpclient = HttpClients.createDefault();
}
String result = "";
try {
try {CloseableHttpResponse response = httpclient.execute(httpRequest);
HttpEntity entity = null;
try {entity = response.getEntity();
result = EntityUtils.toString(entity, charset);
} finally {EntityUtils.consume(entity);
response.close();}
} finally {httpclient.close();
}
} catch (IOException ex) {ex.printStackTrace();
}
return result;
}
public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {
String result = "";
try {
try {CloseableHttpResponse response = httpclient.execute(httpRequest);
HttpEntity entity = null;
try {entity = response.getEntity();
result = EntityUtils.toString(entity, charset);
} finally {EntityUtils.consume(entity);
response.close();}
} finally {httpclient.close();
}
} catch (IOException ex) {ex.printStackTrace();
}
return result;
}
/**
* 创立 SSL 连贯
*/
public static CloseableHttpClient createSSLInsecureClient() {
try {SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
@Override
public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {return true;}
}).build();
SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {
@Override
public boolean verify(String hostname, SSLSession session) {return true;}
});
return HttpClients.custom().setSSLSocketFactory(sslsf).build();} catch (GeneralSecurityException ex) {throw new RuntimeException(ex);
}
}
}
运行
因为网络等原因,我们发现并不能全部下载成功,不过可以多次运行尝试,从而实现较高的下载成功率。
666,厉害了。。
本文链接:https://blog.csdn.net/qq_3540…
近期热文推荐:
1. 1,000+ 道 Java 面试题及答案整理 (2021 最新版)
2. 别再满屏的 if/else 了,试试策略模式,真香!!
3. 卧槽!Java 中的 xx ≠ null 是什么新语法?
4. Spring Boot 2.5 重磅发布,黑暗模式太炸了!
5. 《Java 开发手册(嵩山版)》最新发布,速速下载!
觉得不错,别忘了随手点赞 + 转发哦!