// Extractor.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Mime;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Rac.Common;
using Rac.Tools;
  11. namespace Rac
  12. {
  13. public class Extractor : BaseService
  14. {
  15. private static readonly HashSet<char> invalidChars = new HashSet<char>(Path.GetInvalidFileNameChars());
  16. private readonly string _dbFilePath;
  17. private DataAccess _db;
  18. private Uri _homeUrl;
  19. private HashSet<string> _hostInclude;
  20. private string[] _excludeUrlPrefixes;
  21. private Encoding _defaultEncoding;
  22. private string _webRoot;
  23. public Extractor(string dbFilePath)
  24. {
  25. _dbFilePath = Path.GetFullPath(dbFilePath);
  26. if (false == File.Exists(_dbFilePath))
  27. {
  28. LogFatal($"Can not find database file:{_dbFilePath}");
  29. return;
  30. }
  31. _db = new DataAccess(_dbFilePath);
  32. var conf = new ConfigAdapter(_db.GetConfigs());
  33. _defaultEncoding = null != conf.DefaultCharset ? Encoding.GetEncoding(conf.DefaultCharset) : Encoding.UTF8;
  34. _hostInclude = new HashSet<string>(conf.HostsInclude);
  35. _excludeUrlPrefixes = conf.UrlPrefixExclude;
  36. _homeUrl = new Uri(conf.HomeUrl);
  37. _webRoot = conf.ExtractWebRoot;
  38. }
  39. private readonly Dictionary<string, bool> _eitCache = new Dictionary<string, bool>();
  40. private bool EntryIsTextHtml(string url)
  41. {
  42. if (_eitCache.TryGetValue(url, out var r)) return r;
  43. var entry = _db.GetEntry(url);
  44. if (entry == null || entry.StatusCode == 0) return _eitCache[url] = false;
  45. var headers = HttpHeaderUtility.ParseStringLines(entry.Headers);
  46. string contentType = null;
  47. foreach (var header in headers)
  48. {
  49. if (header.Name == "content-type")
  50. {
  51. var ct = new ContentType(header.Value);
  52. contentType = ct.MediaType;
  53. }
  54. }
  55. return _eitCache[url] = contentType == "text/html";
  56. }
  57. private string UrlTranscode(Uri uri, string link = null, bool includedOnly = false, bool isFs = false, bool appendHtml = false)
  58. {
  59. if (true == link?.ToLower().StartsWith("mailto:")) return link;
  60. if (true == link?.ToLower().StartsWith("data:")) return link;
  61. // http://host:port/path.ext?query#hash
  62. // -->> http/host/port/path!qquery#hash.ext
  63. if (false == string.IsNullOrEmpty(link)) uri = new Uri(uri, link);
  64. var url = uri.ToString();
  65. if (includedOnly)
  66. {
  67. if (_excludeUrlPrefixes.Any(p => url.StartsWith(p))) return link;
  68. var hos = uri.Host;
  69. if (hos != _homeUrl.Host && false == _hostInclude.Contains(hos)) return link;
  70. }
  71. var sbt = new StringBuilder();
  72. sbt.Append($"{uri.Scheme}/{uri.Host}/{uri.Port}");
  73. var lPath = uri.LocalPath;
  74. var fPath = Path.GetDirectoryName(lPath)?.Replace("\\", "/").TrimStart('/');
  75. if (string.IsNullOrWhiteSpace(fPath) == false)
  76. {
  77. var parts = fPath.Split('/');
  78. foreach (var part in parts)
  79. {
  80. sbt.Append("/");
  81. AppendEscape(part);
  82. sbt.Append("!");
  83. }
  84. }
  85. var fName = Path.GetFileNameWithoutExtension(lPath);
  86. var fExt = Path.GetExtension(lPath);
  87. sbt.Append($"/{fName}");
  88. if (string.IsNullOrWhiteSpace(uri.Query) == false) AppendEscape(uri.Query);
  89. if (string.IsNullOrWhiteSpace(fExt) == false) AppendEscape(fExt);
  90. if (sbt[sbt.Length - 1] == '/') sbt.Append("!index.html");
  91. if (isFs == false && string.IsNullOrWhiteSpace(uri.Fragment) == false) sbt.Append(uri.Fragment);
  92. var fin = sbt.ToString();
  93. if (EntryIsTextHtml(url) && fin.EndsWith(".html") == false && fin.EndsWith(".htm") == false)
  94. {
  95. sbt.Append(".html");
  96. return sbt.ToString();
  97. }
  98. return fin;
  99. void AppendEscape(string input)
  100. {
  101. foreach (var c in input)
  102. {
  103. switch (c)
  104. {
  105. case '?': sbt.Append("!q"); break;
  106. case '*': sbt.Append("!s"); break;
  107. case '!': sbt.Append("!e"); break;
  108. default:
  109. if (invalidChars.Contains(c)) sbt.Append($"!{(int)c:X2}");
  110. else sbt.Append(c);
  111. break;
  112. }
  113. }
  114. }
  115. }
  116. private string UrlTranscodeR(Uri uri, string link, bool includedOnly = false)
  117. {
  118. if (true == link?.ToLower().StartsWith("mailto:")) return link;
  119. if (true == link?.ToLower().StartsWith("data:")) return link;
  120. if (includedOnly)
  121. {
  122. var url = (false == string.IsNullOrEmpty(link) ? new Uri(uri, link) : uri).ToString();
  123. if (_excludeUrlPrefixes.Any(p => url.StartsWith(p))) return link;
  124. var hos = uri.Host;
  125. if (hos != _homeUrl.Host && false == _hostInclude.Contains(hos)) return link;
  126. }
  127. var tUrl = UrlTranscode(uri, link);
  128. if (string.IsNullOrWhiteSpace(_webRoot))
  129. {
  130. var bStr = "file://fake/" + UrlTranscode(uri);
  131. var aStr = "file://fake/" + tUrl;
  132. var bU = new Uri(bStr);
  133. var aU = new Uri(aStr);
  134. var r = bU.MakeRelativeUri(aU);
  135. return r.ToString();
  136. }
  137. return _webRoot + tUrl;
  138. }
  139. public void RunExtract()
  140. {
  141. var outputDir = Path.Combine(Path.GetDirectoryName(_dbFilePath), "RacExtract");
  142. var allUrl = _db.GetAllUrl();
  143. foreach (var url in allUrl)
  144. {
  145. var uri = new Uri(url);
  146. var entry = _db.GetEntry(url);
  147. if (entry == null || entry.StatusCode == 0) continue;
  148. var headers = HttpHeaderUtility.ParseStringLines(entry.Headers);
  149. string contentType = null;
  150. var contentEncoding = _defaultEncoding;
  151. foreach (var header in headers)
  152. {
  153. if (header.Name == "content-type")
  154. {
  155. var ct = new ContentType(header.Value);
  156. contentType = ct.MediaType;
  157. if (null != ct.CharSet) contentEncoding = Encoding.GetEncoding(ct.CharSet);
  158. }
  159. }
  160. var fsPath = Path.GetFullPath(Path.Combine(outputDir, UrlTranscode(uri, isFs: true)));
  161. if (File.Exists(fsPath))
  162. {
  163. LogInfo($"SKIP EXIST: {fsPath}");
  164. continue;
  165. }
  166. string location = null;
  167. foreach (var header in headers)
  168. {
  169. if (header.Name == "location") location = UrlTranscodeR(uri, header.Value);
  170. }
  171. var output = entry.Content;
  172. if (contentType == "text/html")
  173. {
  174. var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTranscodeR(uri, p, true), ref contentEncoding);
  175. if (location != null) replaced += $"<br>Location:<a href=\"{location}\">{location}</a> <meta http-equiv=\"refresh\" content=\"1; url={location}\">";
  176. output = contentEncoding.GetBytes(replaced);
  177. }
  178. else if (contentType == "text/css")
  179. {
  180. var css = contentEncoding.GetString(entry.Content);
  181. var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTranscodeR(uri, p, true));
  182. output = contentEncoding.GetBytes(replaced);
  183. }
  184. LogInfo($"Output: {fsPath}");
  185. {
  186. var fsDir = Path.GetDirectoryName(fsPath);
  187. if (Directory.Exists(fsDir) == false) Directory.CreateDirectory(fsDir);
  188. File.WriteAllBytes(fsPath, output);
  189. }
  190. }
  191. }
  192. public override void Start()
  193. {
  194. throw new NotImplementedException();
  195. }
  196. public override void Stop()
  197. {
  198. throw new NotImplementedException();
  199. }
  200. }
  201. }