java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错

java 抓取 https 网页爬虫,解决 Server returned HTTP response code: 403 for URL 报错

关键是在忽略 HTTPS 证书校验的那段代码里,打开连接之后给请求加上 User-Agent 请求头:connection.setRequestProperty("User-Agent", "Mozilla/4.76");

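注意:必须加在 new BufferedReader(也就是调用 connection.getInputStream())之前才行。因为调用 getInputStream() 时请求才真正发出,请求头必须在此之前设置好;连接建立之后再调用 setRequestProperty 不会生效,而是抛出 IllegalStateException。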

HttpsURLConnection.setDefaultHostnameVerifier(hv);
connection = (HttpURLConnection) validationUrl.openConnection();
//first set User-Agent to solve Server returned HTTP response code: 403 for URL
connection.setRequestProperty("User-Agent", "Mozilla/4.76");

final BufferedReader in = new BufferedReader(new InputStreamReader(
        connection.getInputStream()));

在真正抓取页面的地方,先调用一次上面忽略 HTTPS 证书校验的代码,然后再发起请求:

//先调用下忽略 https 证书的再请求才可以
HttpsUrlValidator.retrieveResponseFromServer(url);
doc = Jsoup
        .connect(url)
        .header("User-Agent", rand_agents)

完整的 HttpsUrlValidator.java 代码如下:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;

/**
 * Fetches the body of an HTTP(S) URL while deliberately skipping TLS
 * certificate and host-name validation, and sends a browser-like
 * {@code User-Agent} header with the request.
 *
 * <p><strong>Security warning:</strong> {@link #retrieveResponseFromServer(String)}
 * and {@link #trustAllHttpsCertificates()} replace the JVM-wide defaults of
 * {@link HttpsURLConnection}. After either has run, every
 * {@code HttpsURLConnection} in the process that relies on those defaults
 * accepts any certificate and any host name. Use only for scraping content
 * whose authenticity does not matter.
 */
public class HttpsUrlValidator {

    /**
     * Host-name verifier that prints the requested host and the peer host of
     * the session, then accepts the connection unconditionally.
     */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                               + session.getPeerHost());
            return true;
        }
    };

    /**
     * Requests {@code url} and returns the response body as text.
     *
     * <p>Side effect: installs the trust-all socket factory and {@link #hv}
     * as the process-wide {@link HttpsURLConnection} defaults before
     * connecting.
     *
     * @param url absolute {@code http} or {@code https} URL to fetch
     * @return the response body with every line terminated by {@code "\n"},
     *         or {@code null} if the URL is malformed, the request fails, or
     *         the SSL setup fails (the exception message is printed to
     *         standard output)
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;

        try {
            final URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);

            connection = (HttpURLConnection) validationUrl.openConnection();
            // The User-Agent must be set before getInputStream() sends the
            // request; it avoids "Server returned HTTP response code: 403 for URL".
            connection.setRequestProperty("User-Agent", "Mozilla/4.76");

            final StringBuilder response = new StringBuilder(255);
            // NOTE(review): the body is decoded with the platform default
            // charset, as before; confirm whether an explicit charset is wanted.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    response.append(line);
                    response.append("\n");
                }
            }
            return response.toString();

        } catch (final Exception e) {
            // Covers IOException from the request as well as any failure
            // raised while installing the trust-all SSL context.
            System.out.println(e.getMessage());
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Installs, as the JVM-wide default for {@link HttpsURLConnection}, a
     * socket factory whose only trust manager is {@link miTM}, i.e. one that
     * accepts every certificate chain.
     *
     * @throws Exception if the SSL context cannot be obtained or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        final javax.net.ssl.TrustManager[] trustAllCerts =
                new javax.net.ssl.TrustManager[] { new miTM() };
        final javax.net.ssl.SSLContext sc =
                javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(
                sc.getSocketFactory());
    }

    /**
     * Trust manager that performs no checks: every client and server
     * certificate chain is accepted.
     */
    static class miTM implements javax.net.ssl.TrustManager,
            javax.net.ssl.X509TrustManager {

        /**
         * @return an empty array; the {@code X509TrustManager} contract
         *         requires a non-null (possibly empty) result
         */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return new java.security.cert.X509Certificate[0];
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server certificate chain; never throws. */
        @Override
        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }

        /** Accepts any client certificate chain; never throws. */
        @Override
        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }
    }

}