第一个问题:可以用Apache的HttpClient库请求网页的HTML内容,然后通过Jsoup等HTML解析库获取HTML中的元素。
第二个问题:可以用无头浏览器(Headless Browser,例如HtmlUnit或由Selenium驱动的无头Chrome),它可以像真实浏览器一样执行JavaScript,从而获取渲染后生成的HTML。
如果给定一个URL地址,可以通过以下代码(这里使用JDK自带的HttpURLConnection发起请求,并用Jsoup解析)获取该地址的HTML,并对HTML页面中的元素进行操作:
// Fetches the page at `url` over HTTP with HttpURLConnection, parses the
// response body with Jsoup, and iterates over every <a> element in it.
String url = "https://mobile.yangkeduo.com/goods.html?goods_id=484209795384";
// Declared outside the try so the finally block can always release it.
HttpURLConnection connection = null;
try {
    // 1. Create the URL object and open the connection.
    URL urlObj = new URL(url);
    connection = (HttpURLConnection) urlObj.openConnection();
    // 2. Configure the request, then read the response code and body.
    connection.setRequestMethod("GET");
    // Bound the connect/read time so an unresponsive server cannot block forever.
    connection.setConnectTimeout(10_000);
    connection.setReadTimeout(10_000);
    int responseCode = connection.getResponseCode();
    if (responseCode == HttpURLConnection.HTTP_OK) {
        StringBuilder htmlContent = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when readLine() throws.
        // NOTE(review): decoding assumes the page is UTF-8 — confirm against
        // the response's Content-Type if other encodings are possible.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator; put one back so tokens
                // that were on separate lines are not fused together.
                htmlContent.append(line).append('\n');
            }
        }
        // 3. Parse the HTML with Jsoup.
        Document document = Jsoup.parse(htmlContent.toString());
        // 4. Work with the <a> elements of the page.
        Elements links = document.select("a");
        for (Element link : links) {
            // Visible text and raw href of each link; use them as needed.
            String linkText = link.text();
            String linkUrl = link.attr("href");
        }
    } else {
        System.out.println("HTTP request failed with response code: " + responseCode);
    }
} catch (IOException e) {
    e.printStackTrace();
} finally {
    // Release the connection on both the success and the failure path.
    if (connection != null) {
        connection.disconnect();
    }
}