java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错
java 抓取 https 网页爬虫,解决 Server returned HTTP response code: 403 for URL 报错
关键是在忽略 https 证书校验的代码中,为连接加上 User-Agent 请求头:connection.setRequestProperty("User-Agent", "Mozilla/4.76");
注意:必须加在 new BufferedReader(即调用 connection.getInputStream())之前才行;getInputStream() 会真正发起请求,连接建立之后就无法再设置请求头了。
HttpsURLConnection.setDefaultHostnameVerifier(hv);
connection = (HttpURLConnection) validationUrl.openConnection();
// first set User-Agent to solve Server returned HTTP response code: 403 for URL
connection.setRequestProperty("User-Agent", "Mozilla/4.76");
final BufferedReader in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
在抓取的地方,先调用忽略 https 证书校验的代码,然后再发起请求:
// 先调用下忽略 https 证书的再请求才可以
HttpsUrlValidator.retrieveResponseFromServer(url);
doc = Jsoup.connect(url).header("User-Agent", rand_agents)
完整的 HttpsUrlValidator.java 代码如下:
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL;import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;

/**
 * Fetches the body of an HTTP(S) URL while skipping TLS certificate and host name validation.
 *
 * <p><strong>Security warning:</strong> this class installs a trust-everything
 * {@code SSLSocketFactory} and {@code HostnameVerifier} as the JVM-wide defaults of
 * {@link HttpsURLConnection}. Every later {@code HttpsURLConnection} in the same JVM that relies
 * on those defaults is therefore open to man-in-the-middle attacks. Use it only for crawling
 * hosts whose certificates cannot be validated, never for sensitive traffic.
 */
public class HttpsUrlValidator {

    /** Host name verifier that logs the requested and peer host names and accepts every host. */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                    + session.getPeerHost());
            return true;
        }
    };

    /**
     * Downloads the response body of {@code url} as text.
     *
     * <p>Before connecting, the trust-all socket factory and the accept-all host name verifier
     * are installed as {@link HttpsURLConnection} defaults, and a {@code User-Agent} header is
     * set on the connection (some servers answer 403 to the default Java agent).
     *
     * @param url the absolute http/https URL to fetch
     * @return the body with every line terminated by {@code "\n"}, or {@code null} when the URL
     *         is malformed, is not an HTTP(S) URL, or the request fails for any reason (the
     *         exception message is printed to standard output)
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;
        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);
            connection = (HttpURLConnection) validationUrl.openConnection();
            // Must be set before getInputStream(): that call sends the request, and request
            // properties cannot be changed once the connection is established.
            connection.setRequestProperty("User-Agent", "Mozilla/4.76");
            // try-with-resources closes the reader (and the underlying stream) on every path.
            // getInputStream() is evaluated first, so the headers are available to
            // getContentType() when the charset is resolved.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), charsetOf(connection.getContentType())))) {
                final StringBuilder body = new StringBuilder(255);
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append("\n");
                }
                return body.toString();
            }
        } catch (final Exception e) {
            // Covers IOException (bad URL, network failure, HTTP error status) as well as
            // failures while installing the trust-all defaults or casting the connection.
            System.out.println(e.getMessage());
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Picks the charset declared in a {@code Content-Type} header value.
     *
     * @param contentType the header value, may be {@code null}
     * @return the declared charset, or UTF-8 when none is declared or the name is not usable
     */
    private static java.nio.charset.Charset charsetOf(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, key, 0, key.length())) {
                    String name = param.substring(key.length()).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return java.nio.charset.Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: fall through to the default.
                    }
                }
            }
        }
        return java.nio.charset.StandardCharsets.UTF_8;
    }

    /**
     * Installs a socket factory that accepts any certificate chain as the default for all
     * {@link HttpsURLConnection} instances in this JVM.
     *
     * @throws Exception if the TLS context cannot be created or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
        // "TLS" is the standard SSLContext protocol name; "SSL" is a legacy name.
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that accepts every client and server certificate chain without checks. */
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {

        /** Returns an empty array: the interface contract requires a non-null result. */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return new java.security.cert.X509Certificate[0];
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server certificate chain; never throws. */
        @Override
        public void checkServerTrusted(java.security.cert.X509Certificate[] certs,
                String authType) throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }

        /** Accepts any client certificate chain; never throws. */
        @Override
        public void checkClientTrusted(java.security.cert.X509Certificate[] certs,
                String authType) throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }
    }
}