|
@@ -0,0 +1,247 @@
|
|
|
+using System;
|
|
|
+using System.Collections.Generic;
|
|
|
+using System.IO;
|
|
|
+using System.Linq;
|
|
|
+using System.Net.Mime;
|
|
|
+using System.Text;
|
|
|
+using System.Threading;
|
|
|
+using System.Threading.Tasks;
|
|
|
+using Rac.Common;
|
|
|
+using Rac.Tools;
|
|
|
+
|
|
|
namespace Rac
{
    /// <summary>
    /// Writes the entries stored in a database file to a "RacExtract" directory next to that
    /// file. URLs are mapped to file-system friendly relative paths, and links found in
    /// text/html and text/css entries are rewritten through the same mapping.
    /// </summary>
    public class Extractor : BaseService
    {
        // Characters that may not appear in a file name on the current platform.
        private static readonly HashSet<char> invalidChars = new HashSet<char>(Path.GetInvalidFileNameChars());

        private readonly string _dbFilePath;

        private DataAccess _db;
        private Uri _homeUrl;
        private HashSet<string> _hostInclude;
        private string[] _excludeUrlPrefixes;

        private Encoding _defaultEncoding;
        private string _webRoot;

        // Cache of "is this URL stored as text/html" answers, keyed by URL string.
        private readonly Dictionary<string, bool> _eitCache = new Dictionary<string, bool>();

        /// <summary>
        /// Opens the database at <paramref name="dbFilePath"/> and loads the extraction settings from it.
        /// </summary>
        /// <param name="dbFilePath">Path of the database file; resolved to a full path.</param>
        /// <remarks>
        /// When the file does not exist the problem is reported through <c>LogFatal</c> and the
        /// instance is left without a database; <see cref="RunExtract"/> then refuses to run.
        /// </remarks>
        public Extractor(string dbFilePath)
        {
            _dbFilePath = Path.GetFullPath(dbFilePath);

            if (false == File.Exists(_dbFilePath))
            {
                LogFatal($"Can not find database file:{_dbFilePath}");
                return;
            }

            _db = new DataAccess(_dbFilePath);
            var conf = new ConfigAdapter(_db.GetConfigs());

            _defaultEncoding = null != conf.DefaultCharset ? Encoding.GetEncoding(conf.DefaultCharset) : Encoding.UTF8;

            // Host names are case-insensitive, so the include set is too. A missing setting means "no extra hosts".
            _hostInclude = conf.HostsInclude != null
                ? new HashSet<string>(conf.HostsInclude, StringComparer.OrdinalIgnoreCase)
                : new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            _excludeUrlPrefixes = conf.UrlPrefixExclude ?? new string[0];

            _homeUrl = new Uri(conf.HomeUrl);
            _webRoot = conf.ExtractWebRoot;
        }

        /// <summary>
        /// Parses a Content-Type header value without throwing on malformed input.
        /// </summary>
        /// <returns><c>true</c> and the parsed value, or <c>false</c> when the value is empty or invalid.</returns>
        private static bool TryParseContentType(string headerValue, out ContentType parsed)
        {
            parsed = null;
            if (string.IsNullOrWhiteSpace(headerValue)) return false;

            try
            {
                parsed = new ContentType(headerValue);
                return true;
            }
            catch (FormatException)
            {
                return false;
            }
            catch (ArgumentException)
            {
                return false;
            }
        }

        /// <summary>
        /// Returns the encoding named by <paramref name="charset"/>, or <paramref name="fallback"/>
        /// when the name is empty or not known to the runtime.
        /// </summary>
        private static Encoding ResolveEncoding(string charset, Encoding fallback)
        {
            if (string.IsNullOrWhiteSpace(charset)) return fallback;

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                return fallback;
            }
        }

        /// <summary>
        /// True for links that are never rewritten ("mailto:" and "data:"), compared ordinally
        /// and case-insensitively so the result does not depend on the current culture.
        /// </summary>
        private static bool IsPassThroughLink(string link)
        {
            if (link == null) return false;

            return link.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
                   || link.StartsWith("data:", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Applies the include/exclude configuration to an already resolved URL: it must not start
        /// with an excluded prefix and its host must be the home host or one of the included hosts.
        /// </summary>
        private bool IsIncluded(Uri target, string url)
        {
            if (_excludeUrlPrefixes.Any(p => url.StartsWith(p, StringComparison.Ordinal))) return false;

            var host = target.Host;
            return string.Equals(host, _homeUrl.Host, StringComparison.OrdinalIgnoreCase) || _hostInclude.Contains(host);
        }

        /// <summary>
        /// Tells whether the database entry for <paramref name="url"/> has the media type text/html.
        /// Results (including negative ones) are cached per URL.
        /// </summary>
        private bool EntryIsTextHtml(string url)
        {
            if (_eitCache.TryGetValue(url, out var cached)) return cached;

            var entry = _db.GetEntry(url);
            if (entry == null || entry.StatusCode == 0) return _eitCache[url] = false;

            string contentType = null;
            foreach (var header in HttpHeaderUtility.ParseStringLines(entry.Headers))
            {
                // The last content-type header wins; unparsable values are ignored.
                if (string.Equals(header.Name, "content-type", StringComparison.OrdinalIgnoreCase)
                    && TryParseContentType(header.Value, out var ct))
                {
                    contentType = ct.MediaType;
                }
            }

            return _eitCache[url] = string.Equals(contentType, "text/html", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Like <see cref="EntryIsTextHtml"/>, but also tries the URL with its fragment removed,
        /// so that a link such as "page#section" is recognised when the entry is stored as "page".
        /// </summary>
        private bool IsHtmlTarget(string url)
        {
            if (EntryIsTextHtml(url)) return true;

            // Uri.ToString() keeps '#' escaped inside path and query, so the first '#' starts the fragment.
            var hashAt = url.IndexOf('#');
            return hashAt >= 0 && EntryIsTextHtml(url.Substring(0, hashAt));
        }

        /// <summary>
        /// Maps a URL to a relative path that is safe to use as a file name.
        /// </summary>
        /// <param name="uri">Base URL; it is the URL being mapped when <paramref name="link"/> is empty.</param>
        /// <param name="link">Optional link, resolved against <paramref name="uri"/>.</param>
        /// <param name="includedOnly">When true, URLs rejected by the include/exclude settings are returned as <paramref name="link"/>.</param>
        /// <param name="isFs">When true the fragment is left out, because the result names a file.</param>
        /// <param name="appendHtml">Not used by this method; kept so existing call sites stay valid.</param>
        /// <returns>
        /// The mapped path, or <paramref name="link"/> unchanged for "mailto:"/"data:" links,
        /// links that cannot be resolved, and URLs filtered out by <paramref name="includedOnly"/>.
        /// </returns>
        private string UrlTranscode(Uri uri, string link = null, bool includedOnly = false, bool isFs = false, bool appendHtml = false)
        {
            if (IsPassThroughLink(link)) return link;

            // http://host:port/dir/name.ext?query#hash
            //   -->> http/host/port/dir!/name!qquery.ext#hash
            if (false == string.IsNullOrEmpty(link))
            {
                // A malformed link in a page must not abort the whole extraction; leave it as it is.
                if (false == Uri.TryCreate(uri, link, out var resolved)) return link;
                uri = resolved;
            }

            var url = uri.ToString();
            if (includedOnly && false == IsIncluded(uri, url)) return link;

            var sbt = new StringBuilder();
            sbt.Append($"{uri.Scheme}/{uri.Host}/{uri.Port}");

            var lPath = uri.LocalPath;

            var fPath = Path.GetDirectoryName(lPath)?.Replace("\\", "/").TrimStart('/');
            if (string.IsNullOrWhiteSpace(fPath) == false)
            {
                foreach (var part in fPath.Split('/'))
                {
                    // Every directory segment is written with a trailing '!'.
                    sbt.Append("/");
                    AppendEscape(part);
                    sbt.Append("!");
                }
            }

            var fName = Path.GetFileNameWithoutExtension(lPath);
            var fExt = Path.GetExtension(lPath);

            // The file name is escaped like every other component, so characters that are
            // invalid in file names (LocalPath is unescaped) cannot end up in the output path.
            sbt.Append("/");
            AppendEscape(fName);
            if (string.IsNullOrWhiteSpace(uri.Query) == false) AppendEscape(uri.Query);
            if (string.IsNullOrWhiteSpace(fExt) == false) AppendEscape(fExt);
            if (sbt[sbt.Length - 1] == '/') sbt.Append("!index.html");

            // Decide on the ".html" suffix BEFORE the fragment is appended: the suffix belongs
            // to the file name, and "page#frag.html" would point at "page" instead of "page.html".
            var pathPart = sbt.ToString();
            var hasHtmlExtension = pathPart.EndsWith(".html", StringComparison.Ordinal)
                                   || pathPart.EndsWith(".htm", StringComparison.Ordinal);
            if (hasHtmlExtension == false && IsHtmlTarget(url)) sbt.Append(".html");

            if (isFs == false && string.IsNullOrWhiteSpace(uri.Fragment) == false) sbt.Append(uri.Fragment);

            return sbt.ToString();

            // Appends input to sbt, replacing '?', '*', '!' and characters that are invalid in
            // file names with '!'-prefixed escape sequences.
            void AppendEscape(string input)
            {
                foreach (var c in input)
                {
                    switch (c)
                    {
                        case '?': sbt.Append("!q"); break;
                        case '*': sbt.Append("!s"); break;
                        case '!': sbt.Append("!e"); break;
                        default:
                            if (invalidChars.Contains(c)) sbt.Append($"!{(int)c:X2}");
                            else sbt.Append(c);
                            break;
                    }
                }
            }
        }

        /// <summary>
        /// Maps <paramref name="link"/> (resolved against <paramref name="uri"/>) to the reference
        /// that should be written into extracted content: a path relative to the mapped location of
        /// <paramref name="uri"/> when no web root is configured, otherwise the web root followed by
        /// the mapped path.
        /// </summary>
        /// <param name="uri">URL of the document that contains the link.</param>
        /// <param name="link">The link as it appears in the document.</param>
        /// <param name="includedOnly">When true, links rejected by the include/exclude settings are returned unchanged.</param>
        private string UrlTranscodeR(Uri uri, string link, bool includedOnly = false)
        {
            if (IsPassThroughLink(link)) return link;

            if (includedOnly)
            {
                var target = uri;
                if (false == string.IsNullOrEmpty(link) && false == Uri.TryCreate(uri, link, out target)) return link;

                // Filter on the host of the link target, not on the host of the containing document.
                if (false == IsIncluded(target, target.ToString())) return link;
            }

            var tUrl = UrlTranscode(uri, link);

            if (string.IsNullOrWhiteSpace(_webRoot))
            {
                // Both mapped paths are placed under a fake file:// root so that Uri can compute
                // the path of the target relative to the containing document.
                var bU = new Uri("file://fake/" + UrlTranscode(uri));
                var aU = new Uri("file://fake/" + tUrl);

                return bU.MakeRelativeUri(aU).ToString();
            }

            return _webRoot + tUrl;
        }

        /// <summary>
        /// Writes every stored entry with a non-zero status code to the "RacExtract" directory next
        /// to the database file. Files that already exist are skipped. text/html and text/css
        /// entries have their links rewritten; all other entries are written byte for byte.
        /// </summary>
        /// <exception cref="InvalidOperationException">The database file was not found when the instance was created.</exception>
        public void RunExtract()
        {
            if (_db == null)
            {
                throw new InvalidOperationException($"Database is not available: {_dbFilePath}");
            }

            var outputDir = Path.Combine(Path.GetDirectoryName(_dbFilePath), "RacExtract");

            foreach (var url in _db.GetAllUrl())
            {
                if (false == Uri.TryCreate(url, UriKind.Absolute, out var uri))
                {
                    LogInfo($"SKIP INVALID URL: {url}");
                    continue;
                }

                var entry = _db.GetEntry(url);
                if (entry == null || entry.StatusCode == 0) continue;

                string contentType = null;
                string rawLocation = null;
                var hasLocation = false;
                var contentEncoding = _defaultEncoding;

                foreach (var header in HttpHeaderUtility.ParseStringLines(entry.Headers))
                {
                    if (string.Equals(header.Name, "content-type", StringComparison.OrdinalIgnoreCase))
                    {
                        if (TryParseContentType(header.Value, out var ct))
                        {
                            contentType = ct.MediaType;
                            // An unknown charset keeps the encoding chosen so far instead of aborting the run.
                            contentEncoding = ResolveEncoding(ct.CharSet, contentEncoding);
                        }
                    }
                    else if (string.Equals(header.Name, "location", StringComparison.OrdinalIgnoreCase))
                    {
                        rawLocation = header.Value;
                        hasLocation = true;
                    }
                }

                var fsPath = Path.GetFullPath(Path.Combine(outputDir, UrlTranscode(uri, isFs: true)));
                if (File.Exists(fsPath))
                {
                    LogInfo($"SKIP EXIST: {fsPath}");
                    continue;
                }

                var location = hasLocation ? UrlTranscodeR(uri, rawLocation) : null;

                var output = entry.Content;

                if (string.Equals(contentType, "text/html", StringComparison.OrdinalIgnoreCase))
                {
                    var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTranscodeR(uri, p, true), ref contentEncoding);
                    if (location != null)
                    {
                        // The Location header comes from the remote server: encode it before
                        // placing it inside attributes and text of the generated markup.
                        var safeLocation = System.Net.WebUtility.HtmlEncode(location);
                        replaced += $"<br>Location:<a href=\"{safeLocation}\">{safeLocation}</a> <meta http-equiv=\"refresh\" content=\"1; url={safeLocation}\">";
                    }
                    output = contentEncoding.GetBytes(replaced);
                }
                else if (string.Equals(contentType, "text/css", StringComparison.OrdinalIgnoreCase))
                {
                    var css = contentEncoding.GetString(entry.Content);
                    var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTranscodeR(uri, p, true));
                    output = contentEncoding.GetBytes(replaced);
                }

                LogInfo($"Output: {fsPath}");

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(Path.GetDirectoryName(fsPath));
                File.WriteAllBytes(fsPath, output);
            }
        }

        /// <summary>Not implemented; always throws <see cref="NotImplementedException"/>.</summary>
        public override void Start()
        {
            throw new NotImplementedException();
        }

        /// <summary>Not implemented; always throws <see cref="NotImplementedException"/>.</summary>
        public override void Stop()
        {
            throw new NotImplementedException();
        }
    }
}
|