Crawler.cs 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. using Rac.Common;
  2. using Rac.Entities;
  3. using Rac.Models;
  4. using Rac.Tools;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Linq;
  8. using System.Text;
  9. using System.Threading;
  10. namespace Rac
  11. {
  12. //TODO: CHECK EFFECT OF `Error loading type Typespec 0x1b000044 from due to Could not resolve typespec token 1b000044'
  /// <summary>
  /// Service that crawls outward from the configured home URL on a dedicated background thread.
  /// Each round fetches a batch of not-yet-dumped URLs in parallel, stores every response as an
  /// <see cref="ArchiveEntry"/> and queues the links discovered on those pages that pass the
  /// host / prefix filters.
  /// </summary>
  public class Crawler : BaseService
  {
    /// <summary>Status code recorded on an entry when the request itself threw an exception.</summary>
    private const int ArchiverErrorStatusCode = 544;

    private readonly DataAccess _db;
    private Thread _runner;

    // Written by Start/Stop on the caller's thread and polled by the runner thread,
    // so it must not be cached per-thread.
    private volatile bool _isRunning;

    /// <summary>True from <see cref="Start"/> until the runner thread exits or <see cref="Stop"/> is called.</summary>
    public bool IsRunning
    {
      get { return _isRunning; }
      private set { _isRunning = value; }
    }

    /// <summary>Creates a crawler backed by the database file at <paramref name="dbFilePath"/>.</summary>
    /// <param name="dbFilePath">Path handed to <see cref="DataAccess"/>.</param>
    public Crawler(string dbFilePath)
    {
      _db = new DataAccess(dbFilePath);
    }

    /// <summary>Marks the crawler as running and launches the background runner thread.</summary>
    public override void Start()
    {
      LogInfo("Starting...");
      IsRunning = true;
      _runner = new Thread(RunInternal) { IsBackground = true };
      _runner.Start();
      LogInfo("Started.");
    }

    /// <summary>
    /// Requests the runner to stop after its current round and blocks until it has exited.
    /// Safe to call when <see cref="Start"/> was never invoked.
    /// </summary>
    public override void Stop()
    {
      IsRunning = false;
      // _runner is null until Start() has been called.
      _runner?.Join();
      LogInfo("Stopped.");
    }

    /// <summary>
    /// Thread entry point. Guarantees <see cref="IsRunning"/> is reset on every exit path,
    /// including the early return on missing configuration and unexpected exceptions.
    /// </summary>
    private void RunInternal()
    {
      try
      {
        Crawl();
      }
      finally
      {
        IsRunning = false;
      }
    }

    /// <summary>
    /// Resolves <paramref name="link"/> against <paramref name="pageUri"/> and drops any fragment.
    /// </summary>
    /// <returns>The absolute URI without fragment, or <c>null</c> (after logging a warning) when it cannot be built.</returns>
    private Uri TryResolveLink(Uri pageUri, string link)
    {
      try
      {
        var resolved = new Uri(pageUri, link);
        if (string.IsNullOrEmpty(resolved.Fragment))
        {
          return resolved;
        }

        // Remove the "#fragment" part so that anchors on the same page collapse to one URL.
        var text = resolved.ToString();
        return new Uri(text.Substring(0, text.IndexOf("#", StringComparison.Ordinal)));
      }
      catch (Exception e)
      {
        // Best effort: a malformed link must not abort processing of the rest of the page.
        LogWarning($"{e.Message} {link}");
        return null;
      }
    }

    /// <summary>
    /// Main crawl loop: loads the configuration, then repeatedly pulls a batch of pending URLs,
    /// downloads them in parallel and persists both the responses and the newly found links.
    /// Ends when no pending URL is left or <see cref="IsRunning"/> becomes false.
    /// </summary>
    private void Crawl()
    {
      LogDebug("Loading configs...");
      var conf = new ConfigAdapter(_db.GetConfigs());
      if (string.IsNullOrEmpty(conf.HomeUrl))
      {
        LogFatal($"The config <{nameof(conf.HomeUrl)}> is required! HALT");
        return;
      }

      var excludeUrlPrefix = conf.UrlPrefixExclude;
      // Host names are case-insensitive; Uri.Host is already normalised, config values may not be.
      var hostsInclude = new HashSet<string>(conf.HostsInclude, StringComparer.OrdinalIgnoreCase);
      var homeHost = new Uri(conf.HomeUrl).Host;

      // Classifies the links found on pageUri, records every decision in the database and
      // returns only the links that should be crawled (home host or explicitly included hosts).
      // NOTE(review): invoked from parallel workers, so it relies on DataAccess tolerating
      // concurrent calls - not verifiable from this file.
      string[] ProcessLinkFilters(Uri pageUri, params string[] links)
      {
        var linkUris = links
          .Select(link => TryResolveLink(pageUri, link))
          .Where(candidate => candidate != null)
          .Distinct()
          .ToArray();

        var lstExPfx = new List<Uri>();
        var lstInHome = new List<Uri>();
        var lstInHosts = new List<Uri>();
        var lstExNon = new List<Uri>();
        foreach (var linkUri in linkUris)
        {
          var linkText = linkUri.ToString();
          // Ordinal comparison: URLs are identifiers, not linguistic text. Null/empty prefixes
          // are skipped because an empty prefix would otherwise exclude every link.
          if (excludeUrlPrefix.Any(prefix => !string.IsNullOrEmpty(prefix)
                                             && linkText.StartsWith(prefix, StringComparison.Ordinal)))
          {
            lstExPfx.Add(linkUri);
          }
          else if (string.Equals(linkUri.Host, homeHost, StringComparison.OrdinalIgnoreCase))
          {
            lstInHome.Add(linkUri);
          }
          else if (hostsInclude.Contains(linkUri.Host))
          {
            lstInHosts.Add(linkUri);
          }
          else
          {
            lstExNon.Add(linkUri);
          }
        }

        var pageUrl = pageUri.ToString();

        void AddLinks(List<Uri> items, bool included, string remark)
        {
          if (items.Count > 0)
          {
            _db.BulkAddResourceLink(pageUrl, included, remark, items.Select(p => p.ToString()).ToArray());
          }
        }

        AddLinks(lstExPfx, false, "Exclude by url prefix");
        AddLinks(lstExNon, false, "Exclude by non matched rulers");
        AddLinks(lstInHome, true, "Include by home host");
        AddLinks(lstInHosts, true, "Include by [HostsInclude]");
        return lstInHome.Union(lstInHosts).Select(p => p.ToString()).ToArray();
      }

      _db.CreateArchiveEntryIfNotExist(conf.HomeUrl);
      while (IsRunning)
      {
        var urls = _db.GetNonDumpedUrls(conf.Parallel);
        LogInfo($"Gets {urls.Length} url from db.");
        if (urls.Length == 0)
        {
          break;
        }

        var hssExtractedLinks = new HashSet<string>();
        // HashSet<T> is not thread-safe and PageProc runs on several PLINQ workers at once,
        // so every merge into the shared set goes through this gate.
        var linksGate = new object();

        void CollectLinks(string[] found)
        {
          lock (linksGate)
          {
            hssExtractedLinks.AddRange(found);
          }
        }

        // Downloads one URL and turns the outcome (response or failure) into an archive entry.
        ArchiveEntry PageProc(string url)
        {
          ArchiveEntry entry;
          Response resp;
          var uri = new Uri(url);
          LogTrace($"GET {uri}");
          var referer = _db.GetReferer(url);
          LogTrace($" -> Referer: {referer}");
          try
          {
            resp = Requester.GetHttp(url, referer);
            entry = new ArchiveEntry
            {
              Url = url,
              StatusCode = (int)resp.StatusCode,
              StatusDescription = resp.StatusDescription,
              Content = resp.Body,
            };
          }
          catch (Exception e)
          {
            // The failure itself is archived so the URL is not retried forever.
            LogTrace($"GET {url} -- {e.Message}");
            return new ArchiveEntry
            {
              Url = url,
              StatusCode = ArchiverErrorStatusCode,
              StatusDescription = "ArchiverError",
              Headers = "content-type: text/plain; charset=utf-8",
              Content = Encoding.UTF8.GetBytes(e.ToString())
            };
          }

          // Content may be null when the response carried no body.
          LogTrace($"GET {url} -- {entry.StatusCode}|{resp.ContentType}|{entry.Content?.Length}");
          var listHeaders = new List<HttpHeader> { new HttpHeader("content-type", resp.ContentType) };
          listHeaders.AddRange(resp.GetServerTimeHeaders());
          if (resp.GetRedirect(out var redirectUrl))
          {
            // Filtering does database I/O, so it is done outside the lock.
            CollectLinks(ProcessLinkFilters(uri, redirectUrl));
            listHeaders.Add(new HttpHeader("location", redirectUrl));
          }
          else
          {
            var linksOnPage = new HashSet<string>();
            if (resp.GetHtmlDocument(out var doc))
            {
              linksOnPage.AddRange(LinkProcessor.ExtractLinks(doc).Distinct());
            }
            else if (resp.GetCss(out var css))
            {
              linksOnPage.AddRange(LinkProcessor.FromCss(css).Distinct());
            }

            linksOnPage.Remove("//");
            CollectLinks(ProcessLinkFilters(uri, linksOnPage.ToArray()));
          }

          entry.Headers = listHeaders.ToStringLines();
          return entry;
        }

        var results = urls
          .AsParallel().WithDegreeOfParallelism(conf.Parallel)
          .Select(PageProc)
          .ToArray();

        LogInfo($"Saving extracted {hssExtractedLinks.Count} new link...");
        var n = _db.BulkAddNewArchiveEntry(hssExtractedLinks);
        LogInfo($"Saved new {n} link...");
        LogInfo($"Updating {results.Length} entry...");
        var u = _db.BulkUpdateArchiveEntry(results);
        LogInfo($"Updated {u} entry...");
      }

      IsRunning = false;
      LogInfo("Finished!");
    }
  }
  175. }