Browse Source

add feature: Extractor to filesystem; chore: fix typo

HOME 5 months ago
parent
commit
6977cbe280

+ 0 - 2
Rac.Core/Crawler.cs

@@ -196,9 +196,7 @@ namespace Rac
                 }
 
                 var results = urls
-#if !DEBUG
                     .AsParallel().WithDegreeOfParallelism(conf.Parallel)
-#endif
                     .Select(PageProc)
                     .ToArray();
 

+ 247 - 0
Rac.Core/Extractor.cs

@@ -0,0 +1,247 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Net.Mime;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using Rac.Common;
+using Rac.Tools;
+
+namespace Rac
+{
+    public class Extractor : BaseService
+    {
+        private static readonly HashSet<char> invalidChars = new HashSet<char>(Path.GetInvalidFileNameChars());
+
+        private readonly string _dbFilePath;
+
+        private DataAccess _db;
+        private Uri _homeUrl;
+        private HashSet<string> _hostInclude;
+        private string[] _excludeUrlPrefixes;
+
+        private Encoding _defaultEncoding;
+        private string _webRoot;
+
+        /// <summary>
+        /// Opens the archive database at <paramref name="dbFilePath"/> and caches the settings
+        /// the extractor works with: default charset, host and URL-prefix filters, home URL and web root.
+        /// </summary>
+        /// <param name="dbFilePath">Path of the project database; it is resolved to a full path.</param>
+        /// <remarks>
+        /// When the file does not exist, a fatal message is logged and the constructor returns
+        /// early, leaving the database handle and every cached setting unassigned.
+        /// </remarks>
+        public Extractor(string dbFilePath)
+        {
+            _dbFilePath = Path.GetFullPath(dbFilePath);
+            if (!File.Exists(_dbFilePath))
+            {
+                LogFatal($"Can not find database file:{_dbFilePath}");
+                return;
+            }
+
+            _db = new DataAccess(_dbFilePath);
+            var settings = new ConfigAdapter(_db.GetConfigs());
+
+            // No configured charset means UTF-8.
+            if (settings.DefaultCharset == null)
+            {
+                _defaultEncoding = Encoding.UTF8;
+            }
+            else
+            {
+                _defaultEncoding = Encoding.GetEncoding(settings.DefaultCharset);
+            }
+
+            _hostInclude = new HashSet<string>(settings.HostsInclude);
+            _excludeUrlPrefixes = settings.UrlPrefixExclude;
+
+            _homeUrl = new Uri(settings.HomeUrl);
+            _webRoot = settings.ExtractWebRoot;
+        }
+
+        private readonly Dictionary<string, bool> _eitCache = new Dictionary<string, bool>();
+
+        /// <summary>
+        /// Tells whether the archived response stored for <paramref name="url"/> declares the
+        /// <c>text/html</c> media type. Answers are memoized per URL in <c>_eitCache</c>.
+        /// </summary>
+        /// <param name="url">URL used as the lookup key of the archive entry.</param>
+        /// <returns>
+        /// <c>true</c> when an entry exists, its status code is non-zero and its content-type
+        /// header parses to <c>text/html</c>; otherwise <c>false</c>.
+        /// </returns>
+        private bool EntryIsTextHtml(string url)
+        {
+            if (_eitCache.TryGetValue(url, out var cached)) return cached;
+
+            var entry = _db.GetEntry(url);
+            if (entry == null || entry.StatusCode == 0) return _eitCache[url] = false;
+
+            string mediaType = null;
+            foreach (var header in HttpHeaderUtility.ParseStringLines(entry.Headers))
+            {
+                // HTTP header names are case-insensitive.
+                if (false == string.Equals(header.Name, "content-type", StringComparison.OrdinalIgnoreCase)) continue;
+
+                try
+                {
+                    mediaType = new ContentType(header.Value).MediaType;
+                }
+                catch (Exception ex) when (ex is FormatException || ex is ArgumentException)
+                {
+                    // Archived headers come from arbitrary servers; a value ContentType cannot
+                    // parse is ignored instead of aborting the whole extraction.
+                }
+            }
+
+            // Media types are case-insensitive as well ("Text/HTML" is still HTML).
+            return _eitCache[url] = string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase);
+        }
+
+        /// <summary>
+        /// Converts a URL into the slash-separated relative path used for its extracted copy.
+        /// </summary>
+        /// <remarks>
+        /// <c>http://host:port/dir/name.ext?query#hash</c> becomes
+        /// <c>http/host/port/dir!/name!qquery.ext#hash</c>: every directory segment gets a trailing
+        /// <c>!</c>, the escaped query and extension follow the escaped file name, and a path that
+        /// would end in <c>/</c> gets <c>!index.html</c>. When the archived entry is <c>text/html</c>
+        /// and the path does not already end in <c>.html</c> or <c>.htm</c>, <c>.html</c> is appended.
+        /// The fragment, if kept, always comes last so it stays a fragment of the rewritten link.
+        /// </remarks>
+        /// <param name="uri">Absolute URL to transcode, or the base URL when <paramref name="link"/> is given.</param>
+        /// <param name="link">Optional link resolved against <paramref name="uri"/> before transcoding.</param>
+        /// <param name="includedOnly">
+        /// When <c>true</c>, URLs matching an excluded prefix or pointing at a host that is neither
+        /// the home host nor an included host are not transcoded; <paramref name="link"/> is returned as is.
+        /// </param>
+        /// <param name="isFs">When <c>true</c>, the fragment is left out (file-system path).</param>
+        /// <param name="appendHtml">Not used by this method.</param>
+        /// <returns>The transcoded path, or <paramref name="link"/> unchanged for <c>mailto:</c>/<c>data:</c> links and filtered URLs.</returns>
+        private string UrlTranscode(Uri uri, string link = null, bool includedOnly = false, bool isFs = false, bool appendHtml = false)
+        {
+            // Ordinal, case-insensitive scheme test: ToLower() is culture-sensitive and maps
+            // "MAILTO:" to "maılto:" (dotless i) under Turkish cultures.
+            if (link != null
+                && (link.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
+                    || link.StartsWith("data:", StringComparison.OrdinalIgnoreCase)))
+            {
+                return link;
+            }
+
+            //       http://host:port/dir/name.ext?query#hash
+            // -->>    http/host/port/dir!/name!qquery.ext#hash
+            if (false == string.IsNullOrEmpty(link)) uri = new Uri(uri, link);
+
+            var url = uri.ToString();
+            if (includedOnly)
+            {
+                if (_excludeUrlPrefixes.Any(p => url.StartsWith(p, StringComparison.Ordinal))) return link;
+
+                var host = uri.Host;
+                if (host != _homeUrl.Host && false == _hostInclude.Contains(host)) return link;
+            }
+
+            var sbt = new StringBuilder();
+            sbt.Append($"{uri.Scheme}/{uri.Host}/{uri.Port}");
+
+            var lPath = uri.LocalPath;
+
+            var fPath = Path.GetDirectoryName(lPath)?.Replace("\\", "/").TrimStart('/');
+            if (false == string.IsNullOrWhiteSpace(fPath))
+            {
+                foreach (var part in fPath.Split('/'))
+                {
+                    sbt.Append("/");
+                    AppendEscape(part);
+                    sbt.Append("!");
+                }
+            }
+
+            var fName = Path.GetFileNameWithoutExtension(lPath);
+            var fExt = Path.GetExtension(lPath);
+
+            // The file name goes through the same escaping as directories, query and extension;
+            // unescaped it could carry characters the file system rejects.
+            sbt.Append("/");
+            AppendEscape(fName);
+            if (false == string.IsNullOrWhiteSpace(uri.Query)) AppendEscape(uri.Query);
+            if (false == string.IsNullOrWhiteSpace(fExt)) AppendEscape(fExt);
+            if (sbt[sbt.Length - 1] == '/') sbt.Append("!index.html");
+
+            var path = sbt.ToString();
+
+            // Decide on the ".html" suffix BEFORE the fragment is attached: otherwise the suffix
+            // would land behind "#hash", and the look-up key would contain the fragment.
+            var hasHtmlSuffix = path.EndsWith(".html", StringComparison.Ordinal)
+                                || path.EndsWith(".htm", StringComparison.Ordinal);
+            if (false == hasHtmlSuffix)
+            {
+                var hashAt = url.IndexOf('#');
+                var isHtml = EntryIsTextHtml(url)
+                             || (hashAt >= 0 && EntryIsTextHtml(url.Substring(0, hashAt)));
+                if (isHtml) path += ".html";
+            }
+
+            if (isFs == false && false == string.IsNullOrWhiteSpace(uri.Fragment)) path += uri.Fragment;
+
+            return path;
+
+            // Escapes one URL component: '?' -> "!q", '*' -> "!s", '!' -> "!e", and any other
+            // invalid file-name character -> '!' followed by its code in hexadecimal.
+            void AppendEscape(string input)
+            {
+                foreach (var c in input)
+                {
+                    switch (c)
+                    {
+                        case '?': sbt.Append("!q"); break;
+                        case '*': sbt.Append("!s"); break;
+                        case '!': sbt.Append("!e"); break;
+                        default:
+                            if (invalidChars.Contains(c)) sbt.Append($"!{(int)c:X2}");
+                            else sbt.Append(c);
+                            break;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Rewrites a link found in the page at <paramref name="uri"/> so that it points at the
+        /// extracted copy of its target.
+        /// </summary>
+        /// <param name="uri">Absolute URL of the referring page.</param>
+        /// <param name="link">Link as written in the page; empty or <c>null</c> refers to the page itself.</param>
+        /// <param name="includedOnly">
+        /// When <c>true</c>, links matching an excluded prefix or targeting a host that is neither
+        /// the home host nor an included host are returned unchanged.
+        /// </param>
+        /// <returns>
+        /// The web root followed by the transcoded target when a web root is configured; otherwise
+        /// the target path relative to the referring page's transcoded path. <c>mailto:</c> and
+        /// <c>data:</c> links, filtered links and links that cannot be resolved are returned unchanged.
+        /// </returns>
+        private string UrlTranscodeR(Uri uri, string link, bool includedOnly = false)
+        {
+            // Ordinal, case-insensitive scheme test (ToLower() is culture-sensitive).
+            if (link != null
+                && (link.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
+                    || link.StartsWith("data:", StringComparison.OrdinalIgnoreCase)))
+            {
+                return link;
+            }
+
+            // Resolve the link once. A link that cannot be turned into a URI is left untouched
+            // instead of aborting the extraction with a UriFormatException.
+            var target = uri;
+            if (false == string.IsNullOrEmpty(link) && false == Uri.TryCreate(uri, link, out target)) return link;
+
+            if (includedOnly)
+            {
+                var url = target.ToString();
+                if (_excludeUrlPrefixes.Any(p => url.StartsWith(p, StringComparison.Ordinal))) return link;
+
+                // Filter on the host of the link TARGET. Testing the referring page's host lets
+                // every external (or javascript:) link through to be rewritten into a local path.
+                var host = target.Host;
+                if (host != _homeUrl.Host && false == _hostInclude.Contains(host)) return link;
+            }
+
+            var tUrl = UrlTranscode(target);
+
+            if (false == string.IsNullOrWhiteSpace(_webRoot)) return _webRoot + tUrl;
+
+            // No web root: express the target relative to the referring page. The fake file://
+            // authority only exists so that both paths are absolute URIs for MakeRelativeUri.
+            var pageUri = new Uri("file://fake/" + UrlTranscode(uri));
+            var targetUri = new Uri("file://fake/" + tUrl);
+
+            return pageUri.MakeRelativeUri(targetUri).ToString();
+        }
+
+        /// <summary>
+        /// Writes every archived entry into the <c>RacExtract</c> folder next to the database file.
+        /// </summary>
+        /// <remarks>
+        /// Entries that are missing or have status code 0 are skipped, as are files that already
+        /// exist. Links inside <c>text/html</c> and <c>text/css</c> content are rewritten to the
+        /// extracted layout; an HTML entry carrying a location header additionally gets a link and
+        /// a meta refresh to the rewritten location appended.
+        /// </remarks>
+        public void RunExtract()
+        {
+            // The constructor returns early, without a database, when the file is missing.
+            // Report that here instead of failing with a NullReferenceException; the constructor's
+            // own message is likely raised before a caller could attach a Log handler.
+            if (_db == null)
+            {
+                LogFatal($"Can not find database file:{_dbFilePath}");
+                return;
+            }
+
+            var outputDir = Path.Combine(Path.GetDirectoryName(_dbFilePath), "RacExtract");
+            foreach (var url in _db.GetAllUrl())
+            {
+                var uri = new Uri(url);
+                var entry = _db.GetEntry(url);
+                if (entry == null || entry.StatusCode == 0) continue;
+
+                string contentType = null;
+                string rawLocation = null;
+                var contentEncoding = _defaultEncoding;
+
+                // Single pass over the headers; names are compared case-insensitively as in HTTP.
+                foreach (var header in HttpHeaderUtility.ParseStringLines(entry.Headers))
+                {
+                    if (string.Equals(header.Name, "location", StringComparison.OrdinalIgnoreCase))
+                    {
+                        rawLocation = header.Value;
+                    }
+                    else if (string.Equals(header.Name, "content-type", StringComparison.OrdinalIgnoreCase))
+                    {
+                        try
+                        {
+                            var ct = new ContentType(header.Value);
+                            contentType = ct.MediaType;
+                            if (null != ct.CharSet) contentEncoding = Encoding.GetEncoding(ct.CharSet);
+                        }
+                        catch (Exception ex) when (ex is FormatException || ex is ArgumentException)
+                        {
+                            // Malformed value or unknown charset sent by the archived server:
+                            // keep what was parsed so far and the current encoding, and go on.
+                            LogInfo($"Ignored unusable content-type '{header.Value}': {url}");
+                        }
+                    }
+                }
+
+                var fsPath = Path.GetFullPath(Path.Combine(outputDir, UrlTranscode(uri, isFs: true)));
+                if (File.Exists(fsPath))
+                {
+                    LogInfo($"SKIP EXIST: {fsPath}");
+                    continue;
+                }
+
+                var output = entry.Content;
+
+                if (string.Equals(contentType, "text/html", StringComparison.OrdinalIgnoreCase))
+                {
+                    var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTranscodeR(uri, p, true), ref contentEncoding);
+                    if (rawLocation != null)
+                    {
+                        // The location comes from an archived response header, so it is
+                        // HTML-encoded before being placed into attributes and text.
+                        var location = System.Net.WebUtility.HtmlEncode(UrlTranscodeR(uri, rawLocation));
+                        replaced += $"<br>Location:<a href=\"{location}\">{location}</a> <meta http-equiv=\"refresh\" content=\"1; url={location}\">";
+                    }
+                    output = contentEncoding.GetBytes(replaced);
+                }
+                else if (string.Equals(contentType, "text/css", StringComparison.OrdinalIgnoreCase))
+                {
+                    var css = contentEncoding.GetString(entry.Content);
+                    var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTranscodeR(uri, p, true));
+                    output = contentEncoding.GetBytes(replaced);
+                }
+
+                LogInfo($"Output: {fsPath}");
+
+                // CreateDirectory does nothing when the directory already exists.
+                Directory.CreateDirectory(Path.GetDirectoryName(fsPath));
+                File.WriteAllBytes(fsPath, output);
+            }
+        }
+
+        /// <summary>Not implemented for the extractor; calling it always throws.</summary>
+        /// <exception cref="NotImplementedException">Always thrown.</exception>
+        public override void Start() => throw new NotImplementedException();
+
+        /// <summary>Not implemented for the extractor; calling it always throws.</summary>
+        /// <exception cref="NotImplementedException">Always thrown.</exception>
+        public override void Stop() => throw new NotImplementedException();
+    }
+}

+ 5 - 5
Rac.Core/OfflineWebServer.cs

@@ -51,7 +51,7 @@ namespace Rac
             LogInfo($"OWS runing on " + (string.Join("|", _server.Prefixes)));
         }
 
-        private string UrlTrancode(Uri uri, string link = null, bool includedOnly = false)
+        private string UrlTranscode(Uri uri, string link = null, bool includedOnly = false)
         {
             if (true == link?.ToLower().StartsWith("mailto:")) return link;
 
@@ -81,7 +81,7 @@ namespace Rac
 
             if (ctx.Request.Url.LocalPath == "/")
             {
-                ctx.Response.Redirect(UrlTrancode(_homeUrl));
+                ctx.Response.Redirect(UrlTranscode(_homeUrl));
             }
             else
             {
@@ -132,7 +132,7 @@ namespace Rac
 
                             foreach (var header in headers)
                             {
-                                if (header.Name == "location") header.Value = UrlTrancode(decodedUri, header.Value);
+                                if (header.Name == "location") header.Value = UrlTranscode(decodedUri, header.Value);
                                 if (header.Name == "content-type")
                                 {
                                     var ct = new ContentType(header.Value);
@@ -146,13 +146,13 @@ namespace Rac
 
                             if (contentType == "text/html")
                             {
-                                var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTrancode(decodedUri, p, true), ref contentEncoding);
+                                var replaced = LinkProcessor.ReplaceHtmlLinks(entry.Content, p => UrlTranscode(decodedUri, p, true), ref contentEncoding);
                                 output = contentEncoding.GetBytes(replaced);
                             }
                             else if (contentType == "text/css")
                             {
                                 var css = contentEncoding.GetString(entry.Content);
-                                var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTrancode(decodedUri, p, true));
+                                var replaced = LinkProcessor.ReplaceCssLinks(css, p => UrlTranscode(decodedUri, p, true));
                                 output = contentEncoding.GetBytes(replaced);
                             }
 

+ 1 - 0
Rac.Core/Rac.Core.csproj

@@ -102,6 +102,7 @@
   </ItemGroup>
   <ItemGroup>
     <Compile Include="Crawler.cs" />
+    <Compile Include="Extractor.cs" />
     <Compile Include="FileBrowseServer.cs" />
     <Compile Include="Models\HttpHeader.cs" />
     <Compile Include="Tools\DataAccess.cs" />

+ 1 - 0
Rac.Core/Tools/ConfigAdapter.cs

@@ -47,5 +47,6 @@ namespace Rac.Tools
         public int OwsPort => GetInt32Value();
 
         public string DefaultCharset => GetValue();
+        /// <summary>
+        /// Prefix the extractor puts in front of every rewritten link. When it is blank, the
+        /// extractor emits links relative to the referring page instead.
+        /// </summary>
+        public string ExtractWebRoot => GetValue();
     }
 }

+ 3 - 2
Rac.Core/Tools/DataAccess.cs

@@ -60,6 +60,7 @@ namespace Rac.Tools
                      + $"INSERT INTO Configs ({nameof(ConfigEntry.Key)}) VALUES ('{nameof(ConfigAdapter.OwsPort)}');"
                      + $"INSERT INTO Configs ({nameof(ConfigEntry.Key)}) VALUES ('{nameof(ConfigAdapter.Parallel)}');"
                      + $"INSERT INTO Configs ({nameof(ConfigEntry.Key)}) VALUES ('{nameof(ConfigAdapter.DefaultCharset)}');"
+                     + $"INSERT INTO Configs ({nameof(ConfigEntry.Key)}) VALUES ('{nameof(ConfigAdapter.ExtractWebRoot)}');"
                 );
             }
         }
@@ -199,7 +200,7 @@ namespace Rac.Tools
             }
         }
 
-        // --- for offline server ---
+        // --- for offline server & extractor ---
 
         public ArchiveEntry GetEntry(string url)
         {
@@ -209,7 +210,7 @@ namespace Rac.Tools
             }
         }
 
-        // --- for file browser ---
+        // --- for extractor  ---
 
         public string[] GetAllUrl()
         {

+ 13 - 3
Rac.Executer/Program.cs

@@ -15,7 +15,8 @@ namespace Rac
                 Console.WriteLine("args: [mode] [args]");
                 Console.WriteLine("      crawler [project.db3] * default mode");
                 Console.WriteLine("      offline [project.db3]");
-                Console.WriteLine("      browser [project.db3]");
+                Console.WriteLine("      browser [project.db3] * not impl yet");
+                Console.WriteLine("      extract [project.db3]");
             }
 
             if (args.Length == 0)
@@ -43,6 +44,10 @@ namespace Rac
                     case "browser":
                         RunBrowser(db);
                         break;
+
+                    case "extract":
+                        RunExtract(db);
+                        break;
                 }
             }
 
@@ -50,6 +55,12 @@ namespace Rac
             Console.ReadLine();
         }
 
+        /// <summary>
+        /// "extract" mode: creates an <see cref="Extractor"/> for the given database, echoes its
+        /// log events to the console and runs the extraction.
+        /// </summary>
+        private static void RunExtract(string db)
+        {
+            var extractor = new Extractor(db);
+            extractor.Log += (source, e) =>
+            {
+                Console.WriteLine($"Extract -- {e.Level} -- {e.Log}");
+            };
+            extractor.RunExtract();
+        }
 
         private static void RunCrawler(string db)
         {
@@ -91,6 +102,5 @@ namespace Rac
             Console.ReadLine();
             inst.Stop();
         }
-
     }
-}
+}