using Rac.Common;
using Rac.Entities;
using Rac.Models;
using Rac.Tools;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;

namespace Rac
{
    //TODO: CHECK EFFECT OF `Error loading type Typespec 0x1b000044 from due to Could not resolve typespec token 1b000044'

    /// <summary>
    /// Background service that repeatedly pulls not-yet-dumped URLs from the database,
    /// downloads them in parallel, records the links found on each page and stores the
    /// downloaded content back as archive entries.
    /// </summary>
    public class Crawler : BaseService
    {
        private readonly DataAccess _db;
        private Thread _runner;

        // Written by Stop() on the caller's thread and read by the worker thread's loop,
        // so it must be volatile for the stop request to be observed reliably.
        private volatile bool _isRunning;

        /// <summary>
        /// True from <see cref="Start"/> until the worker thread exits or <see cref="Stop"/> is called.
        /// </summary>
        public bool IsRunning
        {
            get { return _isRunning; }
            private set { _isRunning = value; }
        }

        /// <summary>Creates a crawler backed by the database at <paramref name="dbFilePath"/>.</summary>
        /// <param name="dbFilePath">Path handed to <c>DataAccess</c>.</param>
        public Crawler(string dbFilePath)
        {
            _db = new DataAccess(dbFilePath);
        }

        /// <summary>Starts the crawl loop on a background thread and returns immediately.</summary>
        public override void Start()
        {
            LogInfo("Starting...");
            IsRunning = true;
            _runner = new Thread(RunInternal) { IsBackground = true };
            _runner.Start();
            LogInfo("Started.");
        }

        /// <summary>
        /// Requests the crawl loop to stop and blocks until the worker thread has exited.
        /// The loop only checks the flag between batches, so the current batch is finished first.
        /// </summary>
        public override void Stop()
        {
            IsRunning = false;
            // Stop() may be called without a prior Start(); there is no thread to wait for then.
            _runner?.Join();
            LogInfo("Stopped.");
        }

        /// <summary>
        /// Thread entry point. Guarantees that <see cref="IsRunning"/> is reset however the
        /// crawl ends (normal completion, missing configuration, or an exception).
        /// </summary>
        private void RunInternal()
        {
            try
            {
                RunCrawl();
            }
            finally
            {
                IsRunning = false;
            }
        }

        /// <summary>
        /// Loads the configuration, seeds the archive with the home URL and processes
        /// batches of pending URLs until none are left or a stop is requested.
        /// </summary>
        private void RunCrawl()
        {
            LogDebug("Loading configs...");
            var conf = new ConfigAdapter(_db.GetConfigs());

            if (string.IsNullOrEmpty(conf.HomeUrl))
            {
                LogFatal($"The config <{nameof(conf.HomeUrl)}> is required! HALT");
                return;
            }

            var excludeUrlPrefix = conf.UrlPrefixExclude;
            var hostsInclude = new HashSet<string>(conf.HostsInclude);
            var homeUri = new Uri(conf.HomeUrl);
            var homeHost = homeUri.Host.ToLowerInvariant();

            // Resolves the raw links against the page URL, strips fragments, classifies every
            // link as included/excluded, records all of them as resource links of the page and
            // returns the URLs of the included ones.
            string[] ProcessLinkFilters(Uri pageUri, params string[] links)
            {
                var linkUris = links.Select(p =>
                {
                    try
                    {
                        var uri = new Uri(pageUri, p);
                        if (string.IsNullOrEmpty(uri.Fragment)) return uri;

                        //remove hash
                        var uos = uri.ToString();
                        uos = uos.Substring(0, uos.IndexOf("#", StringComparison.Ordinal));
                        return new Uri(uos);
                    }
                    catch (Exception e)
                    {
                        // A link that cannot be parsed is logged and dropped.
                        LogWarning($"{e.Message} {p}");
                        return null;
                    }
                }).Where(p => null != p).Distinct().ToArray();

                var lstExPfx = new List<Uri>();
                var lstInHome = new List<Uri>();
                var lstInHosts = new List<Uri>();
                var lstExNon = new List<Uri>();

                foreach (var linkUri in linkUris)
                {
                    // URLs are compared ordinally; culture-sensitive matching is wrong for identifiers.
                    var linkUrl = linkUri.ToString();
                    if (excludeUrlPrefix.Any(p => linkUrl.StartsWith(p, StringComparison.Ordinal)))
                    {
                        lstExPfx.Add(linkUri);
                        continue;
                    }

                    if (linkUri.Host == homeHost)
                    {
                        lstInHome.Add(linkUri);
                        continue;
                    }

                    if (hostsInclude.Contains(linkUri.Host))
                    {
                        lstInHosts.Add(linkUri);
                        continue;
                    }

                    lstExNon.Add(linkUri);
                }

                var pageUrl = pageUri.ToString();

                void AddLinks(List<Uri> items, bool included, string remark)
                {
                    if (items.Any())
                        _db.BulkAddResourceLink(pageUrl, included, remark, items.Select(p => p.ToString()).ToArray());
                }

                AddLinks(lstExPfx, false, "Exclude by url prefix");
                AddLinks(lstExNon, false, "Exclude by non matched rulers");
                AddLinks(lstInHome, true, "Include by home host");
                AddLinks(lstInHosts, true, "Include by [HostsInclude]");

                return lstInHome.Union(lstInHosts).Select(p => p.ToString()).ToArray();
            }

            _db.CreateArchiveEntryIfNotExist(conf.HomeUrl);

            while (IsRunning)
            {
                var urls = _db.GetNonDumpedUrls(conf.Parallel);
                LogInfo($"Gets {urls.Length} url from db.");
                if (urls.Length == 0) break;

                var hssExtractedLinks = new HashSet<string>();
                // PageProc runs on several PLINQ workers at once and HashSet<T> is not safe for
                // concurrent writes, so every merge into hssExtractedLinks goes through this lock.
                var extractedLinksGate = new object();

                // Downloads one URL and builds the archive entry for it; links discovered on
                // the way are merged into hssExtractedLinks.
                ArchiveEntry PageProc(string url)
                {
                    ArchiveEntry entry;
                    Response resp;
                    var uri = new Uri(url);

                    LogTrace($"GET {uri}");
                    var referer = _db.GetReferer(url);
                    LogTrace($" -> Referer: {referer}");

                    try
                    {
                        resp = Requester.GetHttp(url, referer);
                        entry = new ArchiveEntry
                        {
                            Url = url,
                            StatusCode = (int)resp.StatusCode,
                            StatusDescription = resp.StatusDescription,
                            Content = resp.Body,
                        };
                    }
                    catch (Exception e)
                    {
                        // A failed request is still archived, with status 544 "ArchiverError"
                        // and the exception text as plain-text content.
                        LogTrace($"GET {url} -- {e.Message}");
                        entry = new ArchiveEntry
                        {
                            Url = url,
                            StatusCode = 544,
                            StatusDescription = "ArchiverError",
                            Headers = "content-type: text/plain; charset=utf-8",
                            Content = Encoding.UTF8.GetBytes(e.ToString())
                        };
                        return entry;
                    }

                    LogTrace($"GET {url} -- {entry.StatusCode}|{resp.ContentType}|{entry.Content.Length}");

                    var listHeaders = new List<HttpHeader>
                    {
                        new HttpHeader("content-type", resp.ContentType)
                    };
                    listHeaders.AddRange(resp.GetServerTimeHeaders());

                    if (resp.GetRedirect(out var redirectUrl))
                    {
                        // A redirect target is treated like a single link found on the page.
                        var accepted = ProcessLinkFilters(uri, redirectUrl);
                        lock (extractedLinksGate)
                        {
                            hssExtractedLinks.AddRange(accepted);
                        }

                        listHeaders.Add(new HttpHeader("location", redirectUrl));
                    }
                    else
                    {
                        var linksOnPage = new HashSet<string>();
                        if (resp.GetHtmlDocument(out var doc))
                            linksOnPage.AddRange(LinkProcessor.ExtractLinks(doc).Distinct());
                        else if (resp.GetCss(out var css))
                            linksOnPage.AddRange(LinkProcessor.FromCss(css).Distinct());

                        // NOTE(review): the bare "//" link is dropped explicitly; presumably an
                        // empty protocol-relative reference - confirm.
                        linksOnPage.Remove("//");

                        var accepted = ProcessLinkFilters(uri, linksOnPage.ToArray());
                        lock (extractedLinksGate)
                        {
                            hssExtractedLinks.AddRange(accepted);
                        }
                    }

                    entry.Headers = listHeaders.ToStringLines();
                    return entry;
                }

                var results = urls
                    .AsParallel().WithDegreeOfParallelism(conf.Parallel)
                    .Select(PageProc)
                    .ToArray();

                LogInfo($"Saving extracted {hssExtractedLinks.Count} new link...");
                var n = _db.BulkAddNewArchiveEntry(hssExtractedLinks);
                LogInfo($"Saved new {n} link...");

                LogInfo($"Updating {results.Length} entry...");
                var u = _db.BulkUpdateArchiveEntry(results);
                LogInfo($"Updated {u} entry...");
            }

            IsRunning = false;
            LogInfo("Finished!");
        }
    }
}