java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错
java 抓取 https 网页爬虫,解决 Server returned HTTP response code: 403 for URL 报错
关键是在忽略 https 的地方加上:connection.setRequestProperty("User-Agent", "Mozilla/4.76");
注意:必须在调用 connection.getInputStream()(也就是 new BufferedReader 那一行)之前设置;连接建立后再调用 setRequestProperty 会抛出 IllegalStateException,请求头也不会生效。
HttpsURLConnection.setDefaultHostnameVerifier(hv);
connection = (HttpURLConnection) validationUrl.openConnection();
// first set User-Agent to solve Server returned HTTP response code: 403 for URL
connection.setRequestProperty("User-Agent", "Mozilla/4.76");
final BufferedReader in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
抓取的地方先调用忽略 https 的代码
//先调用下忽略 https 证书的再请求才可以
HttpsUrlValidator.retrieveResponseFromServer(url);
doc = Jsoup
.connect(url)
.header("User-Agent",rand_agents)
完整的 HttpsUrlValidator.java 代码如下:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
/**
 * Downloads the body of an HTTP(S) URL while deliberately skipping TLS
 * certificate and host-name verification.
 *
 * <p><strong>Security warning:</strong> {@link #trustAllHttpsCertificates()} and
 * {@link #hv} are installed as the JVM-wide defaults of
 * {@link HttpsURLConnection}, so after the first call to
 * {@link #retrieveResponseFromServer(String)} every {@code HttpsURLConnection}
 * created in the process that relies on those defaults accepts any certificate
 * for any host. This makes man-in-the-middle attacks trivial; use it only for
 * crawling content whose authenticity does not matter.
 */
public class HttpsUrlValidator {

    /** Length of the {@code charset=} prefix matched in the Content-Type header. */
    private static final int CHARSET_PREFIX_LENGTH = "charset=".length();

    /**
     * Host-name verifier that logs the mismatch to standard output and then
     * accepts the connection unconditionally.
     */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                    + session.getPeerHost());
            return true;
        }
    };

    /**
     * Fetches {@code url} with a browser-like {@code User-Agent} header and
     * returns the response body.
     *
     * <p>Before connecting, the trust-all socket factory and the accept-all
     * host-name verifier are installed as process-wide defaults (see the class
     * warning). The {@code User-Agent} header is set before the response stream
     * is requested because some servers answer HTTP 403 to the default Java agent.
     *
     * <p>NOTE: no connect or read timeout is configured, so a stalled server
     * blocks the calling thread indefinitely.
     *
     * @param url absolute {@code http} or {@code https} URL to fetch
     * @return the response body with every line terminated by {@code "\n"}, or
     *         {@code null} if the URL is malformed, is not an HTTP(S) URL, or
     *         any error occurs while connecting or reading (the exception
     *         message is printed to standard output)
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;
        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);
            connection = (HttpURLConnection) validationUrl.openConnection();
            // Must be set before getInputStream(): that call sends the request.
            connection.setRequestProperty("User-Agent", "Mozilla/4.76");

            // try-with-resources closes the reader (and the underlying stream)
            // on every exit path; the original never closed it.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), responseCharset(connection)))) {
                // The builder is method-local, so no synchronization is needed.
                final StringBuilder body = new StringBuilder(255);
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append("\n");
                }
                return body.toString();
            }
        } catch (final Exception e) {
            // Covers IOException (bad URL, 403, network failure) as well as
            // ClassCastException for non-HTTP URLs and SSL set-up failures.
            // Callers rely on the null return, so the error is reported, not rethrown.
            System.out.println(e.getMessage());
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Picks the charset used to decode the response body.
     *
     * @param connection connection whose response headers are already available
     * @return the charset named in the {@code Content-Type} header when it is
     *         present and supported by this JVM; otherwise the platform default
     *         charset
     */
    private static Charset responseCharset(final HttpURLConnection connection) {
        final String contentType = connection.getContentType();
        if (contentType != null) {
            for (final String part : contentType.split(";")) {
                final String param = part.trim();
                if (param.regionMatches(true, 0, "charset=", 0, CHARSET_PREFIX_LENGTH)) {
                    String name = param.substring(CHARSET_PREFIX_LENGTH).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return Charset.forName(name);
                    } catch (final IllegalArgumentException ignored) {
                        // Malformed or unsupported charset label: use the default below.
                        break;
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Replaces the default {@link HttpsURLConnection} socket factory with one
     * that accepts every certificate chain.
     *
     * <p>This is a process-wide, insecure setting; see the class warning.
     *
     * @throws Exception if the TLS context cannot be created or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        final TrustManager[] trustAllCerts = { new miTM() };
        // "TLS" is the current standard protocol name; "SSL" is the legacy name.
        final SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /**
     * Trust manager that accepts every client and server certificate chain
     * without inspecting it.
     */
    static class miTM implements TrustManager, X509TrustManager {

        /**
         * @return an empty array; the {@link X509TrustManager} contract requires
         *         a non-null result
         */
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }

        /**
         * Not part of {@link X509TrustManager}; kept for existing callers.
         *
         * @param certs ignored
         * @return always {@code true}
         */
        public boolean isServerTrusted(X509Certificate[] certs) {
            return true;
        }

        /**
         * Not part of {@link X509TrustManager}; kept for existing callers.
         *
         * @param certs ignored
         * @return always {@code true}
         */
        public boolean isClientTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server chain: never throws. */
        @Override
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: every server certificate is trusted.
        }

        /** Accepts any client chain: never throws. */
        @Override
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: every client certificate is trusted.
        }
    }
}