// LinkProcessor.cs
using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Alba.CsCss.Style;
using AngleSharp;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
  9. namespace Rac.Tools
  10. {
  11. // This class can change to instance class
  12. // Implement different driver (StringParser/WebBrowserEmu/Etc)
  13. public class LinkProcessor
  14. {
  15. private static readonly CssLoader CssLoader = new CssLoader();
  16. public static string[] FromCss(string css)
  17. {
  18. return CssLoader.GetUris(css).ToArray();
  19. }
  20. public static string[] ExtractLinks(IHtmlDocument doc)
  21. {
  22. //extract urls from embedded style
  23. var links = doc.QuerySelectorAll("style").Select(p => FromCss(p.TextContent)).SelectMany(p => p).ToList();
  24. //extract urls from page
  25. links.AddRange(doc.QuerySelectorAll("*[src],*[href]")
  26. .Select(e => e.Attributes["src"]?.Value ?? e.Attributes["href"]?.Value)
  27. .Where(p => p != null));
  28. return links.ToArray();
  29. }
  30. public static string ReplaceCssLinks(string css, Func<string, string> func)
  31. {
  32. var links = FromCss(css).OrderByDescending(p => p.Length).Distinct().Where(s => string.IsNullOrWhiteSpace(s) == false).ToArray();
  33. foreach (var link in links)
  34. {
  35. if (link != "//")
  36. css = css.Replace(link, func(link));
  37. }
  38. return css;
  39. }
  40. public static string ReplaceHtmlLinks(byte[] html, Func<string, string> func, ref Encoding encoding)
  41. {
  42. IHtmlDocument doc;
  43. if (null == encoding)
  44. {
  45. using var stream = new MemoryStream(html);
  46. doc = new HtmlParser().ParseDocument(stream);
  47. }
  48. else
  49. {
  50. var source = encoding.GetString(html);
  51. doc = new HtmlParser().ParseDocument(source);
  52. }
  53. var styles = doc.QuerySelectorAll("style");
  54. foreach (var style in styles)
  55. {
  56. style.TextContent = ReplaceCssLinks(style.TextContent, func);
  57. }
  58. foreach (var srcEl in doc.QuerySelectorAll("*[src]")) if (srcEl.Attributes["src"]?.Value != "//") srcEl.Attributes["src"].Value = func(srcEl.Attributes["src"].Value);
  59. foreach (var srcEl in doc.QuerySelectorAll("*[href]")) if (srcEl.Attributes["href"]?.Value != "//") srcEl.Attributes["href"].Value = func(srcEl.Attributes["href"].Value);
  60. if (null == encoding) encoding = Encoding.GetEncoding(doc.CharacterSet);
  61. return doc.ToHtml();
  62. }
  63. }
  64. }