Java 代码:从网络下载 HTML 并进行处理

it2024-11-08  16

从 HTML 中提取图片(<img> 标签的 src)地址

工具类okhttp的依赖 <dependency> <groupId>com.squareup.okhttp3</groupId> <artifactId>okhttp</artifactId> <version>4.4.0</version> </dependency> 工具类okhttp package com.hengqin.life.common.util; import lombok.extern.slf4j.Slf4j; import okhttp3.*; import javax.net.ssl.*; import java.io.IOException; import java.io.InputStream; import java.security.SecureRandom; import java.security.cert.X509Certificate; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.concurrent.TimeUnit; /** * OkHttp3工具类 * * @author lzj10 * @create 2020-10-10 10:20 */ @Slf4j public class OkHttpUtils { public static final MediaType JSON = MediaType.parse("application/json; charset=utf-8"); public static final MediaType XML = MediaType.parse("application/xml"); private OkHttpUtils() { throw new IllegalStateException("请不要对我实例化!"); } /** * get请求 * * @param url url地址 */ public static String get(final String url) throws IOException { return get(url, null, null); } /** * get请求带参数 * * @param url url地址 * @param headerMap 请求头map * @param paramMap 请求参数map */ public static String get(String url, Map<String, String> headerMap, Map<String, String> paramMap) throws IOException { return Objects.requireNonNull(getToResponse(url, headerMap, paramMap).body()).string(); } /** * get获取流 * * @param url url地址 */ public static InputStream getToInputStream(final String url) throws IOException { return getToInputStreamByMap(url, null, null); } /** * get获取流 带参数 * * @param url url地址 * @param headerMap 请求头map * @param paramMap 请求参数map */ public static InputStream getToInputStreamByMap(String url, Map<String, String> headerMap, Map<String, String> paramMap) throws IOException { return Objects.requireNonNull(getToResponse(url, headerMap, paramMap).body()).byteStream(); } /** * get公用方法 获取返回体 带参数 * * @param url url地址 * @param headerMap 请求头map * @param paramMap 请求参数map */ public static Response getToResponse(String url, Map<String, String> headerMap, Map<String, String> paramMap) throws IOException { 
if (paramMap != null && !paramMap.isEmpty()) { url = url + "?" + map2String(paramMap); } Request request = new Request.Builder() .url(url) .build(); if (headerMap != null && !headerMap.isEmpty()) { request = getNewRequest(request, headerMap); } OkHttpClient client = new OkHttpClient.Builder() .retryOnConnectionFailure(false) // .addInterceptor(new LogInterceptor()) // .sslSocketFactory(createSSLSocketFactory(), new OkHttpUtils.TrustAllManager()) // .hostnameVerifier(new OkHttpUtils.TrustAllHostnameVerifier()) .build(); return client.newCall(request).execute(); } /** * json格式post请求接口调用 * * @param url 接口地址 * @param requestMessage json格式请求参数体 */ public static String postJSON(final String url, final String requestMessage) throws IOException { return postJSON(url, requestMessage, null); } /** * json格式post请求接口调用 带请求头 * * @param url 接口地址 * @param requestMessage json格式请求参数体 * @param headerMap map格式请求头参数体 */ public static String postJSON(final String url, String requestMessage, Map<String, String> headerMap) throws IOException { return postJSON(url, requestMessage, headerMap,30); } /** * json格式post请求接口调用 带请求头 * * @param url 接口地址 * @param requestMessage json格式请求参数体 * @param headerMap map格式请求头参数体 */ public static String postJSON(final String url, String requestMessage, Map<String, String> headerMap,final int timeout) throws IOException { return post(timeout, TimeUnit.SECONDS, JSON, url, requestMessage, headerMap); } /** * xml格式post请求接口调用 * * @param url 接口地址 * @param requestMessage xml格式请求参数体 */ public static String postXML(final String url, final String requestMessage) throws IOException { return postXML(url, requestMessage, null); } /** * xml格式post请求接口调用 带请求头 * * @param url 接口地址 * @param requestMessage json格式请求参数体 * @param headerMap map格式请求头参数体 */ public static String postXML(final String url, String requestMessage, Map<String, String> headerMap) throws IOException { return post(30, TimeUnit.SECONDS, XML, url, requestMessage, headerMap); } /** * json格式post请求 获取响应参数 * * 
@param url 接口地址 * @param requestMessage json格式请求参数体 */ public static Response postToResponseByJSON(final String url, final String requestMessage) throws IOException { return postToResponse(30, TimeUnit.SECONDS, JSON, url, requestMessage, null); } /** * post公用方法 带参数 * * @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param mediaType 类型 * @param url url 地址 * @param requestMessage 请求的String格式参数体 * @param headerMap 请求头的map格式参数体 */ public static String post(final int timeout, final TimeUnit timeUnit, final MediaType mediaType, final String url, final String requestMessage, Map<String, String> headerMap) throws IOException { return Objects.requireNonNull(postToResponse(timeout, timeUnit, mediaType, url, requestMessage, headerMap).body()).string(); } /** * json格式post请求 获取流 * * @param url 接口地址 * @param requestMessage json格式请求参数体 */ public static InputStream postToInputStreamByJSON(final String url, final String requestMessage) throws IOException { return postToInputStreamByJSON(url, requestMessage, null); } /** * json格式post请求 获取流 带请求头 * * @param url 接口地址 * @param requestMessage json格式请求参数体 * @param headerMap map格式请求头参数体 */ public static InputStream postToInputStreamByJSON(final String url, String requestMessage, Map<String, String> headerMap) throws IOException { return postToInputStream(30, TimeUnit.SECONDS, JSON, url, requestMessage, headerMap); } /** * post公用方法 获取文件流 带参数 * * @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param mediaType 类型 * @param url url 地址 * @param requestMessage 请求的String格式参数体 * @param headerMap 请求头的map格式参数体 */ public static InputStream postToInputStream(final int timeout, final TimeUnit timeUnit, final MediaType mediaType, final String url, final String requestMessage, Map<String, String> headerMap) throws IOException { return Objects.requireNonNull(postToResponse(timeout, timeUnit, mediaType, url, requestMessage, headerMap).body()).byteStream(); } /** * post公用方法 获取返回体 带参数 * 注:client.newCall(request).execute();返回的Response response.body();只能调用一次; * 
* @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param mediaType 类型 * @param url url 地址 * @param requestMessage 请求的String格式参数体 * @param headerMap 请求头的map格式参数体 */ public static Response postToResponse(final int timeout, final TimeUnit timeUnit, final MediaType mediaType, final String url, final String requestMessage, Map<String, String> headerMap) throws IOException { Request request = new Request.Builder() .url(url) .post(RequestBody.create(requestMessage, mediaType)) // .post(RequestBody.create(mediaType, requestMessage)) .build(); if (headerMap != null && !headerMap.isEmpty()) { request = getNewRequest(request, headerMap); } OkHttpClient client = new OkHttpClient.Builder() .retryOnConnectionFailure(false) // .sslSocketFactory(createSSLSocketFactory(), new OkHttpUtils.TrustAllManager()) // .hostnameVerifier(new OkHttpUtils.TrustAllHostnameVerifier()) // .callTimeout(timeout, timeUnit) .connectTimeout(timeout, timeUnit) .readTimeout(timeout, timeUnit) // .addInterceptor(new LogInterceptor()) .build(); return client.newCall(request).execute(); } /** * post map格式 请求接口调用 * * @param url 接口地址 * @param mapParams map格式请求参数体 */ public static String postByMap(final String url, Map<String, String> mapParams) throws IOException { return postByMap(url, mapParams, null); } /** * post map格式 请求接口调用 * * @param url 接口地址 * @param mapParams map格式请求参数体 * @param headerMap map格式请求头参数体 */ public static String postByMap(final String url, Map<String, String> mapParams, Map<String, String> headerMap) throws IOException { return postByMap(30, TimeUnit.SECONDS, url, mapParams, headerMap); } /** * post map格式公用方法 带参数 * * @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param url url 地址 * @param mapParams map格式请求参数体 * @param headerMap 请求头的map格式参数体 */ public static String postByMap(final int timeout, final TimeUnit timeUnit, final String url, Map<String, String> mapParams, Map<String, String> headerMap) throws IOException { return Objects.requireNonNull(postToResponseByForm(timeout, timeUnit, 
url, mapParams, headerMap).body()).string(); } /** * post map格式 获取流 方法 * * @param url url 地址 * @param mapParams map格式请求参数体 */ public static InputStream postToInputStreamByMap(final String url, Map<String, String> mapParams) throws IOException { return postToInputStreamByMap(url, mapParams, null); } /** * post map格式 获取流 方法 带参数 * * @param url url 地址 * @param mapParams map格式请求参数体 * @param headerMap 请求头的map格式参数体 */ public static InputStream postToInputStreamByMap(final String url, Map<String, String> mapParams, Map<String, String> headerMap) throws IOException { return postToInputStreamByMap(30, TimeUnit.SECONDS, url, mapParams, headerMap); } /** * post map格式 获取流 公用方法 带参数 * * @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param url url 地址 * @param mapParams map格式请求参数体 * @param headerMap 请求头的map格式参数体 */ public static InputStream postToInputStreamByMap(final int timeout, final TimeUnit timeUnit, final String url, Map<String, String> mapParams, Map<String, String> headerMap) throws IOException { return Objects.requireNonNull(postToResponseByForm(timeout, timeUnit, url, mapParams, headerMap).body()).byteStream(); } /** * post map格式 表单提交 公用方法 获取返回体 带参数 * * @param timeout 超时时间 * @param timeUnit 超时时间单位 * @param url url 地址 * @param mapParams 请求的map格式参数体 * @param headerMap 请求头的map格式参数体 */ public static Response postToResponseByForm(final int timeout, final TimeUnit timeUnit, final String url, Map<String, String> mapParams, Map<String, String> headerMap) throws IOException { FormBody.Builder builder = new FormBody.Builder(); if (mapParams != null && !mapParams.isEmpty()) { builder = getNewFormBody(mapParams); } FormBody formBody = builder.build(); Request request = new Request.Builder() .url(url) .post(formBody) // .post(RequestBody.create(mediaType, requestMessage)) .build(); if (headerMap != null && !headerMap.isEmpty()) { request = getNewRequest(request, headerMap); } OkHttpClient client = new OkHttpClient.Builder() .retryOnConnectionFailure(false) // 
.sslSocketFactory(createSSLSocketFactory(), new OkHttpUtils.TrustAllManager()) // .hostnameVerifier(new OkHttpUtils.TrustAllHostnameVerifier()) // .callTimeout(timeout, timeUnit) .connectTimeout(timeout, timeUnit) .readTimeout(timeout, timeUnit) // .addInterceptor(new LogInterceptor()) .build(); return client.newCall(request).execute(); } private static FormBody.Builder getNewFormBody(Map<String, String> mapParams) { FormBody.Builder builder = new FormBody.Builder(); for (Map.Entry<String, String> entry : mapParams.entrySet()) { builder.add(entry.getKey(), entry.getValue()); } return builder; } private static Request getNewRequest(Request request, Map<String, String> headerMap) { Request.Builder requestBuilder = request.newBuilder(); //遍历集合,将参数添加到请求头header中 for (Map.Entry<String, String> entry : headerMap.entrySet()) { requestBuilder.header(entry.getKey(), entry.getValue()); } return requestBuilder.build(); } /** * 默认信任所有的证书 * 最好加上证书认证,主流App都有自己的证书 */ private static SSLSocketFactory createSSLSocketFactory() { SSLSocketFactory sSLSocketFactory = null; try { SSLContext sc = SSLContext.getInstance("TLSv1.2"); sc.init(null, new TrustManager[]{new OkHttpUtils.TrustAllManager()}, new SecureRandom()); sSLSocketFactory = sc.getSocketFactory(); } catch (Exception e) { log.info("createSSLSocketFactory()-异常={}", e); } return sSLSocketFactory; } private static class TrustAllManager implements X509TrustManager { @Override public void checkClientTrusted(X509Certificate[] chain, String authType) { log.info("TrustAllManager.checkClientTrusted()-无信任证书!"); } @Override public void checkServerTrusted(X509Certificate[] chain, String authType) { log.info("TrustAllManager.checkServerTrusted()-无信任证书!"); } @Override public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[]{}; } } private static class TrustAllHostnameVerifier implements HostnameVerifier { @Override public boolean verify(String requestedHost, SSLSession remoteServerSession) { return 
requestedHost.equalsIgnoreCase(remoteServerSession.getPeerHost()); } } public static String map2String(final Map<String, String> messageMap) { Optional<String> messageStr = messageMap.entrySet().stream() .filter(m -> m.getValue() != null && !"".equals(m.getValue()) && !"null".equals(m.getValue())) .map(map -> map.getKey() + "=" + map.getValue()) .reduce((a, b) -> a + "&" + b); return messageStr.isPresent() ? messageStr.get() : ""; } public static String cleanUrl(String url) { return url.replaceAll("/+$", ""); } } 对图片处理的工具类 /** * 通过html获取image地址 * @param resultString * @return */ public static List<String> getImgSrc (String resultString){ List<String> list = new ArrayList<String>(); //目前img标签标示有3种表达式 //<img alt="" src="1.jpg"/> <img alt="" src="1.jpg"></img> <img alt="" src="1.jpg"> //开始匹配content中的<img />标签 Pattern p_img = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)"); Matcher m_img = p_img.matcher(resultString); boolean result_img = m_img.find(); if (result_img) { while (result_img) { //获取到匹配的<img />标签中的内容 String str_img = m_img.group(2); //开始匹配<img />标签中的src Pattern p_src = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')"); Matcher m_src = p_src.matcher(str_img); if (m_src.find()) { String str_src = m_src.group(3); list.add(str_src); } //结束匹配<img />标签中的src //匹配content中是否存在下一个<img />标签,有则继续以上步骤匹配<img />标签中的src result_img = m_img.find(); } } return list; } 获取body内容工具类 package com.hengqin.life.common.util; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; /** * ClassName:HtmlUtils * Package:com.hengqin.life.common.util * Description: * * @date:2020/10/2017:17 * @author:liuxiao */ public class HtmlUtils { public static String getBody(String path){ String body = ""; try { InputStream iStream = OkHttpUtils.getToInputStream(path); Reader reader = new InputStreamReader(iStream); BufferedReader htmlReader = new BufferedReader(reader); String line; boolean found = false; while (!found && (line = 
htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("<body") != -1) { // 在<body>的前面可能存在空格 found = true; } } found = false; while (!found && (line = htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("</body") != -1) { found = true; } else { body = body + line; } } htmlReader.close(); } catch (Exception e) { e.printStackTrace(); } return body; } } 通过网络在线文档的html中取出image地址 String resultImage = OkHttpUtils.get(filesExtendVO.getFileHtml()); List<String> imgFile = ImageUtils.getImgSrc(resultImage); filesExtendVO.setImgFile(imgFile); 通过网络在线文档的html中取出body内容 String body = HtmlUtils.getBody(filesExtendVO.getFileHtml()); filesExtendVO.setContent(body);
最新回复(0)