using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Net.Mime; using System.Text; using System.Threading; using System.Threading.Tasks; using Rac.Common; using Rac.Tools; namespace Rac { public class Extractor : BaseService { private static readonly HashSet invalidChars = new HashSet(Path.GetInvalidFileNameChars()); private readonly string _dbFilePath; private DataAccess _db; private Uri _homeUrl; private HashSet _hostInclude; private string[] _excludeUrlPrefixes; private Encoding _defaultEncoding; private string _webRoot; public Extractor(string dbFilePath) { _dbFilePath = Path.GetFullPath(dbFilePath); if (false == File.Exists(_dbFilePath)) { LogFatal($"Can not find database file:{_dbFilePath}"); return; } _db = new DataAccess(_dbFilePath); var conf = new ConfigAdapter(_db.GetConfigs()); _defaultEncoding = null != conf.DefaultCharset ? Encoding.GetEncoding(conf.DefaultCharset) : Encoding.UTF8; _hostInclude = new HashSet(conf.HostsInclude); _excludeUrlPrefixes = conf.UrlPrefixExclude; _homeUrl = new Uri(conf.HomeUrl); _webRoot = conf.ExtractWebRoot; } private readonly Dictionary _eitCache = new Dictionary(); private bool EntryIsTextHtml(string url) { if (_eitCache.TryGetValue(url, out var r)) return r; var entry = _db.GetEntry(url); if (entry == null || entry.StatusCode == 0) return _eitCache[url] = false; var headers = HttpHeaderUtility.ParseStringLines(entry.Headers); string contentType = null; foreach (var header in headers) { if (header.Name == "content-type") { var ct = new ContentType(header.Value); contentType = ct.MediaType; } } return _eitCache[url] = contentType == "text/html"; } private string UrlTranscode(Uri uri, string link = null, bool includedOnly = false, bool isFs = false, bool appendHtml = false) { if (true == link?.ToLower().StartsWith("mailto:")) return link; if (true == link?.ToLower().StartsWith("data:")) return link; // http://host:port/path.ext?query#hash // -->> http/host/port/path!qquery#hash.ext if (false == string.IsNullOrEmpty(link)) uri = new Uri(uri, link); var url = uri.ToString(); if (includedOnly) { if (_excludeUrlPrefixes.Any(p => url.StartsWith(p))) return link; var hos = uri.Host; if (hos != _homeUrl.Host && false == _hostInclude.Contains(hos)) return link; } var sbt = new StringBuilder(); sbt.Append($"{uri.Scheme}/{uri.Host}/{uri.Port}"); var lPath = uri.LocalPath; var fPath = Path.GetDirectoryName(lPath)?.Replace("\\", "/").TrimStart('/'); if (string.IsNullOrWhiteSpace(fPath) == false) { var parts = fPath.Split('/'); foreach (var part in parts) { sbt.Append("/"); AppendEscape(part); sbt.Append("!"); } } var fName = Path.GetFileNameWithoutExtension(lPath); var fExt = Path.GetExtension(lPath); sbt.Append($"/{fName}"); if (string.IsNullOrWhiteSpace(uri.Query) == false) AppendEscape(uri.Query); if (string.IsNullOrWhiteSpace(fExt) == false) AppendEscape(fExt); if (sbt[sbt.Length - 1] == '/') sbt.Append("!index.html"); if (isFs == false && string.IsNullOrWhiteSpace(uri.Fragment) == false) sbt.Append(uri.Fragment); var fin = sbt.ToString(); if (EntryIsTextHtml(url) && fin.EndsWith(".html") == false && fin.EndsWith(".htm") == false) { sbt.Append(".html"); return sbt.ToString(); } return fin; void AppendEscape(string input) { foreach (var c in input) { switch (c) { case '?': sbt.Append("!q"); break; case '*': sbt.Append("!s"); break; case '!': sbt.Append("!e"); break; default: if (invalidChars.Contains(c)) sbt.Append($"!{(int)c:X2}"); else sbt.Append(c); break; } } } } private string UrlTranscodeR(Uri uri, string link, bool includedOnly = false) { if (true == link?.ToLower().StartsWith("mailto:")) return link; if (true == link?.ToLower().StartsWith("data:")) return link; if (includedOnly) { var url = (false == string.IsNullOrEmpty(link) ? new Uri(uri, link) : uri).ToString(); if (_excludeUrlPrefixes.Any(p => url.StartsWith(p))) return link; var hos = uri.Host; if (hos != _homeUrl.Host && false == _hostInclude.Contains(hos)) return link; } var tUrl = UrlTranscode(uri, link); if (string.IsNullOrWhiteSpace(_webRoot)) { var bStr = "file://fake/" + UrlTranscode(uri); var aStr = "file://fake/" + tUrl; var bU = new Uri(bStr); var aU = new Uri(aStr); var r = bU.MakeRelativeUri(aU); return r.ToString(); } return _webRoot + tUrl; } public void RunExtract() { var outputDir = Path.Combine(Path.GetDirectoryName(_dbFilePath), "RacExtract"); var allUrl = _db.GetAllUrl(); foreach (var url in allUrl) { var uri = new Uri(url); var entry = _db.GetEntry(url); if (entry == null || entry.StatusCode == 0) continue; var headers = HttpHeaderUtility.ParseStringLines(entry.Headers); string contentType = null; var contentEncoding = _defaultEncoding; foreach (var header in headers) { if (header.Name == "content-type") { var ct = new ContentType(header.Value); contentType = ct.MediaType; if (null != ct.CharSet) contentEncoding = Encoding.GetEncoding(ct.CharSet); } } var fsPath = Path.GetFullPath(Path.Combine(outputDir, UrlTranscode(uri, isFs: true))); if (File.Exists(fsPath)) { LogInfo($"SKIP EXIST: {fsPath}"); continue; } string location = null; foreach (var header in headers) { if (header.Name == "location") location = UrlTranscodeR(uri, header.Value); } var output = entry.Content; if (contentType == "text/html") { var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTranscodeR(uri, p, true), ref contentEncoding); if (location != null) replaced += $"
Location:{location} "; output = contentEncoding.GetBytes(replaced); } else if (contentType == "text/css") { var css = contentEncoding.GetString(entry.Content); var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTranscodeR(uri, p, true)); output = contentEncoding.GetBytes(replaced); } LogInfo($"Output: {fsPath}"); { var fsDir = Path.GetDirectoryName(fsPath); if (Directory.Exists(fsDir) == false) Directory.CreateDirectory(fsDir); File.WriteAllBytes(fsPath, output); } } } public override void Start() { throw new NotImplementedException(); } public override void Stop() { throw new NotImplementedException(); } } }