123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- using Rac.Common;
- using Rac.Entities;
- using Rac.Models;
- using Rac.Tools;
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.Threading;
namespace Rac
{
    // TODO: check the effect of `Error loading type Typespec 0x1b000044 from due to Could not resolve typespec token 1b000044'

    /// <summary>
    /// Service that crawls a site on a dedicated background thread: it repeatedly pulls a batch of
    /// not-yet-dumped URLs from the database, fetches them in parallel, records every link found
    /// (with the reason it was included or excluded) and stores the fetched content.
    /// </summary>
    public class Crawler : BaseService
    {
        private readonly DataAccess _db;
        private Thread _runner;

        // Written by Start/Stop on the caller's thread and polled by the runner thread,
        // so it is volatile to make the stop request visible across threads.
        private volatile bool _isRunning;

        /// <summary>
        /// True from <see cref="Start"/> until the crawl loop ends (finished, stopped, or aborted).
        /// </summary>
        public bool IsRunning
        {
            get { return _isRunning; }
            private set { _isRunning = value; }
        }

        /// <summary>Creates a crawler that works against the database stored at <paramref name="dbFilePath"/>.</summary>
        /// <param name="dbFilePath">Path handed to the <c>DataAccess</c> constructor.</param>
        public Crawler(string dbFilePath)
        {
            _db = new DataAccess(dbFilePath);
        }

        /// <summary>Marks the crawler as running and launches the crawl loop on a background thread.</summary>
        public override void Start()
        {
            LogInfo("Starting...");
            IsRunning = true;
            _runner = new Thread(RunInternal) { IsBackground = true };
            _runner.Start();
            LogInfo("Started.");
        }

        /// <summary>
        /// Requests the crawl loop to stop and waits for the runner thread to exit.
        /// The loop only checks the flag between batches, so the batch in flight is completed first.
        /// </summary>
        public override void Stop()
        {
            IsRunning = false;
            // Stop() may be called before Start(); in that case there is no thread to wait for.
            _runner?.Join();
            LogInfo("Stopped.");
        }

        /// <summary>
        /// Thread entry point. Runs the crawl and guarantees that <see cref="IsRunning"/> is reset
        /// on every exit path, including the early return for a missing home URL and exceptions.
        /// </summary>
        private void RunInternal()
        {
            try
            {
                Crawl();
            }
            finally
            {
                IsRunning = false;
            }
        }

        /// <summary>
        /// Resolves <paramref name="link"/> against <paramref name="pageUri"/> and strips the fragment
        /// ("#...") part, if any.
        /// </summary>
        /// <returns>The resolved URI, or <c>null</c> (after logging a warning) when the link cannot be processed.</returns>
        private Uri ResolveLink(Uri pageUri, string link)
        {
            try
            {
                var resolved = new Uri(pageUri, link);
                if (string.IsNullOrEmpty(resolved.Fragment))
                {
                    return resolved;
                }

                // Remove the hash: keep everything before the first '#'.
                var text = resolved.ToString();
                text = text.Substring(0, text.IndexOf("#", StringComparison.Ordinal));
                return new Uri(text);
            }
            catch (Exception e)
            {
                // Best effort: a single bad link must not abort processing of the whole page.
                LogWarning($"{e.Message} {link}");
                return null;
            }
        }

        /// <summary>
        /// The crawl loop: loads the configuration, seeds the home URL, then processes batches of
        /// pending URLs until none are left or a stop is requested.
        /// </summary>
        private void Crawl()
        {
            LogDebug("Loading configs...");
            var conf = new ConfigAdapter(_db.GetConfigs());
            if (string.IsNullOrEmpty(conf.HomeUrl))
            {
                LogFatal($"The config <{nameof(conf.HomeUrl)}> is required! HALT");
                return;
            }

            var excludeUrlPrefix = conf.UrlPrefixExclude;
            // Host names are case-insensitive, so configured hosts must match regardless of casing.
            var hostsInclude = new HashSet<string>(conf.HostsInclude, StringComparer.OrdinalIgnoreCase);
            var homeUri = new Uri(conf.HomeUrl);
            var homeHost = homeUri.Host.ToLowerInvariant();

            // Classifies the links found on a page, records each group in the database together
            // with the reason, and returns the URLs that should be crawled.
            string[] ProcessLinkFilters(Uri pageUri, params string[] links)
            {
                var linkUris = links
                    .Select(link => ResolveLink(pageUri, link))
                    .Where(resolved => resolved != null)
                    .Distinct()
                    .ToArray();

                var lstExPfx = new List<Uri>();
                var lstInHome = new List<Uri>();
                var lstInHosts = new List<Uri>();
                var lstExNon = new List<Uri>();

                // First matching rule wins: prefix exclusion, then home host, then included hosts.
                foreach (var linkUri in linkUris)
                {
                    var linkText = linkUri.ToString();
                    if (excludeUrlPrefix.Any(prefix => linkText.StartsWith(prefix, StringComparison.Ordinal)))
                    {
                        lstExPfx.Add(linkUri);
                    }
                    else if (linkUri.Host == homeHost)
                    {
                        lstInHome.Add(linkUri);
                    }
                    else if (hostsInclude.Contains(linkUri.Host))
                    {
                        lstInHosts.Add(linkUri);
                    }
                    else
                    {
                        lstExNon.Add(linkUri);
                    }
                }

                var pageUrl = pageUri.ToString();

                void AddLinks(List<Uri> items, bool included, string remark)
                {
                    if (items.Any())
                    {
                        _db.BulkAddResourceLink(pageUrl, included, remark, items.Select(p => p.ToString()).ToArray());
                    }
                }

                AddLinks(lstExPfx, false, "Exclude by url prefix");
                AddLinks(lstExNon, false, "Exclude by non matched rulers");
                AddLinks(lstInHome, true, "Include by home host");
                AddLinks(lstInHosts, true, "Include by [HostsInclude]");

                return lstInHome.Union(lstInHosts).Select(p => p.ToString()).ToArray();
            }

            _db.CreateArchiveEntryIfNotExist(conf.HomeUrl);

            while (IsRunning)
            {
                var urls = _db.GetNonDumpedUrls(conf.Parallel);
                LogInfo($"Gets {urls.Length} url from db.");
                if (urls.Length == 0)
                {
                    break;
                }

                var hssExtractedLinks = new HashSet<string>();
                // PageProc runs on several PLINQ workers at once and HashSet<T> is not safe for
                // concurrent writes, so every merge into the shared set happens under this lock.
                var extractedLinksGate = new object();

                // Fetches one URL and turns the outcome into an archive entry. Links discovered on
                // the page are filtered and merged into hssExtractedLinks.
                // NOTE(review): _db is also used from these workers (GetReferer, BulkAddResourceLink);
                // this assumes DataAccess tolerates concurrent calls - confirm.
                ArchiveEntry PageProc(string url)
                {
                    ArchiveEntry entry;
                    Response resp;
                    var uri = new Uri(url);
                    LogTrace($"GET {uri}");
                    var referer = _db.GetReferer(url);
                    LogTrace($" -> Referer: {referer}");
                    try
                    {
                        resp = Requester.GetHttp(url, referer);
                        entry = new ArchiveEntry
                        {
                            Url = url,
                            StatusCode = (int)resp.StatusCode,
                            StatusDescription = resp.StatusDescription,
                            Content = resp.Body,
                        };
                    }
                    catch (Exception e)
                    {
                        // The request itself failed: store a synthetic entry (status 544) carrying
                        // the exception text so the URL is not picked up as pending again.
                        LogTrace($"GET {url} -- {e.Message}");
                        return new ArchiveEntry
                        {
                            Url = url,
                            StatusCode = 544,
                            StatusDescription = "ArchiverError",
                            Headers = "content-type: text/plain; charset=utf-8",
                            Content = Encoding.UTF8.GetBytes(e.ToString())
                        };
                    }

                    LogTrace($"GET {url} -- {entry.StatusCode}|{resp.ContentType}|{entry.Content.Length}");

                    var listHeaders = new List<HttpHeader> { new HttpHeader("content-type", resp.ContentType) };
                    listHeaders.AddRange(resp.GetServerTimeHeaders());

                    string[] acceptedLinks;
                    if (resp.GetRedirect(out var redirectUrl))
                    {
                        // A redirect contributes exactly one link: its target.
                        acceptedLinks = ProcessLinkFilters(uri, redirectUrl);
                        listHeaders.Add(new HttpHeader("location", redirectUrl));
                    }
                    else
                    {
                        var linksOnPage = new HashSet<string>();
                        if (resp.GetHtmlDocument(out var doc))
                        {
                            linksOnPage.AddRange(LinkProcessor.ExtractLinks(doc).Distinct());
                        }
                        else if (resp.GetCss(out var css))
                        {
                            linksOnPage.AddRange(LinkProcessor.FromCss(css).Distinct());
                        }

                        linksOnPage.Remove("//");
                        acceptedLinks = ProcessLinkFilters(uri, linksOnPage.ToArray());
                    }

                    // Filtering is done outside the lock; only the merge is serialized.
                    lock (extractedLinksGate)
                    {
                        hssExtractedLinks.AddRange(acceptedLinks);
                    }

                    entry.Headers = listHeaders.ToStringLines();
                    return entry;
                }

                var results = urls
                    .AsParallel().WithDegreeOfParallelism(conf.Parallel)
                    .Select(PageProc)
                    .ToArray();

                LogInfo($"Saving extracted {hssExtractedLinks.Count} new link...");
                var n = _db.BulkAddNewArchiveEntry(hssExtractedLinks);
                LogInfo($"Saved new {n} link...");

                LogInfo($"Updating {results.Length} entry...");
                var u = _db.BulkUpdateArchiveEntry(results);
                LogInfo($"Updated {u} entry...");
            }

            IsRunning = false;
            LogInfo("Finished!");
        }
    }
}
|