123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Alba.CsCss.Style;
using AngleSharp;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
namespace Rac.Tools
{
    /// <summary>
    /// Helpers for discovering and rewriting the links (URIs) found in CSS text and in
    /// parsed HTML documents.
    /// </summary>
    /// <remarks>
    /// Design note: every member is static today. The class could be turned into an instance
    /// class so that different extraction drivers (string parser, web-browser emulation, etc.)
    /// can be plugged in.
    /// </remarks>
    public class LinkProcessor
    {
        // Attributes whose values are treated as links, in the order they are processed.
        private static readonly string[] LinkAttributes = { "src", "href" };

        // Link value that is always left untouched and never handed to the rewrite callback.
        private const string SkippedLink = "//";

        private static readonly CssLoader CssLoader = new CssLoader();

        /// <summary>
        /// Returns the URIs that the CSS loader reports for the given style sheet text.
        /// </summary>
        /// <param name="css">CSS source text.</param>
        /// <returns>The URIs found, in the order the loader yields them.</returns>
        public static string[] FromCss(string css)
        {
            return CssLoader.GetUris(css).ToArray();
        }

        /// <summary>
        /// Collects the links referenced by a parsed HTML document.
        /// </summary>
        /// <param name="doc">The parsed document to inspect.</param>
        /// <returns>
        /// URIs found in embedded <c>style</c> elements, followed by the values of every
        /// <c>src</c> and <c>href</c> attribute in the order the elements are returned by the query.
        /// </returns>
        public static string[] ExtractLinks(IHtmlDocument doc)
        {
            // URIs referenced from embedded <style> elements come first.
            var links = doc.QuerySelectorAll("style")
                .SelectMany(style => FromCss(style.TextContent))
                .ToList();

            // An element carrying both attributes contributes both values (src first), so the
            // result covers everything ReplaceHtmlLinks would rewrite.
            links.AddRange(doc.QuerySelectorAll("*[src],*[href]")
                .SelectMany(element => LinkAttributes.Select(name => element.Attributes[name]?.Value))
                .Where(value => value != null));

            return links.ToArray();
        }

        /// <summary>
        /// Rewrites every link found in a piece of CSS text.
        /// </summary>
        /// <param name="css">CSS source text. Null or empty text is returned unchanged.</param>
        /// <param name="func">
        /// Maps an original link to its replacement. It is invoked once per distinct link,
        /// longest link first; blank links and the literal <c>//</c> are never passed to it.
        /// </param>
        /// <returns>The CSS text with every occurrence of each link replaced.</returns>
        public static string ReplaceCssLinks(string css, Func<string, string> func)
        {
            if (string.IsNullOrEmpty(css))
            {
                return css;
            }

            var links = FromCss(css)
                .Where(link => !string.IsNullOrWhiteSpace(link) && link != SkippedLink)
                .Distinct()
                .OrderByDescending(link => link.Length)
                .ToArray();

            // Nothing to rewrite; also avoids building an empty pattern that would match everywhere.
            if (links.Length == 0)
            {
                return css;
            }

            // Resolve each replacement exactly once, up front.
            var replacements = links.ToDictionary(link => link, func, StringComparer.Ordinal);

            // Substitute in a single pass over the ORIGINAL text. Replacing link by link would
            // re-scan text that was already rewritten, so a short link contained in a longer
            // link (or in a replacement value) would be substituted a second time. Listing the
            // alternatives longest first makes the longest link win at any given position.
            var pattern = string.Join("|", links.Select(Regex.Escape));
            return Regex.Replace(
                css,
                pattern,
                match => replacements[match.Value] ?? string.Empty,
                RegexOptions.CultureInvariant);
        }

        /// <summary>
        /// Parses an HTML page, rewrites the links in it and returns the resulting markup.
        /// </summary>
        /// <param name="html">Raw bytes of the HTML page.</param>
        /// <param name="func">
        /// Maps an original link to its replacement. Attribute values equal to <c>//</c> are left as is.
        /// </param>
        /// <param name="encoding">
        /// Encoding used to decode <paramref name="html"/>. When null, the bytes are handed to the
        /// parser as a stream and, on return, this is set to the encoding named by the parsed
        /// document's character set.
        /// </param>
        /// <returns>The serialized document after link rewriting.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="encoding"/> was null and the document's character set name is not
        /// recognised by <see cref="Encoding.GetEncoding(string)"/>.
        /// </exception>
        public static string ReplaceHtmlLinks(byte[] html, Func<string, string> func, ref Encoding encoding)
        {
            var parser = new HtmlParser();
            IHtmlDocument doc;
            if (encoding == null)
            {
                using (var stream = new MemoryStream(html))
                {
                    doc = parser.ParseDocument(stream);
                }
            }
            else
            {
                doc = parser.ParseDocument(encoding.GetString(html));
            }

            // Links inside embedded style sheets.
            foreach (var style in doc.QuerySelectorAll("style"))
            {
                style.TextContent = ReplaceCssLinks(style.TextContent, func);
            }

            // Link attributes: every src is rewritten first, then every href.
            foreach (var name in LinkAttributes)
            {
                foreach (var element in doc.QuerySelectorAll("*[" + name + "]"))
                {
                    var attribute = element.Attributes[name];
                    if (attribute == null || attribute.Value == SkippedLink)
                    {
                        continue;
                    }

                    attribute.Value = func(attribute.Value);
                }
            }

            if (encoding == null)
            {
                encoding = Encoding.GetEncoding(doc.CharacterSet);
            }

            return doc.ToHtml();
        }
    }
}
|