Crawler.cs 7.6 KB


  1. using Rac.Common;
  2. using Rac.Entities;
  3. using Rac.Models;
  4. using Rac.Tools;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Linq;
  8. using System.Text;
  9. using System.Threading;
  10. namespace Rac
  11. {
  12. //public enum CrawlTaskEventType
  13. //{
  14. // Started,
  15. // Stopped,
  16. //}
  17. //public class CrawlTaskEventArgs : EventArgs
  18. //{
  19. // public string Url { get; set; }
  20. // public CrawlTaskEventType Type { get; set; }
  21. //}
  22. //TODO: CHECK EFFECT OF `Error loading type Typespec 0x1b000044 from due to Could not resolve typespec token 1b000044'
  23. public class Crawler : BaseService
  24. {
  25. private readonly DataAccess _db;
  26. private Thread _runner;
  27. public bool IsRunning { get; private set; }
  28. public Crawler(string dbFilename)
  29. {
  30. _db = new DataAccess(dbFilename);
  31. }
  32. public override void Start()
  33. {
  34. LogInfo("Starting...");
  35. IsRunning = true;
  36. _runner = new Thread(RunInternal) { IsBackground = true };
  37. _runner.Start();
  38. LogInfo("Started.");
  39. }
  40. public override void Stop()
  41. {
  42. IsRunning = false;
  43. _runner.Join();
  44. LogInfo("Stopped.");
  45. }
  46. private void RunInternal()
  47. {
  48. LogDebug("Loading configs...");
  49. var conf = new ConfigAdapter(_db.GetConfigs());
  50. if (string.IsNullOrEmpty(conf.HomeUrl))
  51. {
  52. LogFatal($"The config <{nameof(conf.HomeUrl)}> is required! HALT");
  53. return;
  54. }
  55. var excludeUrlPrefix = conf.UrlPrefixExclude;
  56. var hostsInclude = new HashSet<string>(conf.HostsInclude);
  57. var homeUri = new Uri(conf.HomeUrl);
  58. var homeHost = homeUri.Host.ToLower();
  59. string[] ProcessLinkFilters(Uri pageUri, params string[] links)
  60. {
  61. var linkUris = links.Select(p =>
  62. {
  63. try
  64. {
  65. var uri = new Uri(pageUri, p);
  66. if (string.IsNullOrEmpty(uri.Fragment)) return uri;
  67. //remove hash
  68. var uos = uri.ToString();
  69. uos = uos.Substring(0, uos.IndexOf("#", StringComparison.Ordinal));
  70. return new Uri(uos);
  71. }
  72. catch (Exception e)
  73. {
  74. LogWarning($"{e.Message} {p}");
  75. return null;
  76. }
  77. }).Where(p => null != p).Distinct().ToArray();
  78. var lstExPfx = new List<Uri>();
  79. var lstInHome = new List<Uri>();
  80. var lstInHosts = new List<Uri>();
  81. var lstExNon = new List<Uri>();
  82. foreach (var linkUri in linkUris)
  83. {
  84. if (excludeUrlPrefix.Any(p => linkUri.ToString().StartsWith(p)))
  85. {
  86. lstExPfx.Add(linkUri);
  87. continue;
  88. }
  89. if (linkUri.Host == homeHost)
  90. {
  91. lstInHome.Add(linkUri);
  92. continue;
  93. }
  94. if (hostsInclude.Contains(linkUri.Host))
  95. {
  96. lstInHosts.Add(linkUri);
  97. continue;
  98. }
  99. lstExNon.Add(linkUri);
  100. }
  101. var pageUrl = pageUri.ToString();
  102. void AddLinks(List<Uri> items, bool included, string remark)
  103. {
  104. if (items.Any()) _db.BulkAddResourceLink(pageUrl, included, remark, items.Select(p => p.ToString()).ToArray());
  105. }
  106. AddLinks(lstExPfx, false, "Exclude by url prefix");
  107. AddLinks(lstInHome, true, "Include by home host");
  108. AddLinks(lstInHosts, false, "Exclude by url prefix");
  109. AddLinks(lstExNon, false, "Exclude by non matched rulers");
  110. return lstInHome.Union(lstInHosts).Select(p => p.ToString()).ToArray();
  111. }
  112. _db.CreateArchiveEntryIfNotExist(conf.HomeUrl);
  113. while (IsRunning)
  114. {
  115. var urls = _db.GetNonDumpedUrls(conf.Parallel);
  116. LogInfo($"Gets {urls.Length} url from db.");
  117. if (urls.Length == 0) break;
  118. var hssExtractedLinks = new HashSet<string>();
  119. ArchiveEntry PageProce(string url)
  120. {
  121. ArchiveEntry entry;
  122. Response resp;
  123. var uri = new Uri(url);
  124. LogTrace($"GET {uri}");
  125. try
  126. {
  127. resp = Requester.GetHttp(url);
  128. entry = new ArchiveEntry
  129. {
  130. Url = url,
  131. StatusCode = (int)resp.StatusCode,
  132. StatusDescription = resp.StatusDescription,
  133. Content = resp.Body,
  134. };
  135. }
  136. catch (Exception e)
  137. {
  138. LogTrace($"GET {url} -- {e.Message}");
  139. entry = new ArchiveEntry
  140. {
  141. Url = url,
  142. StatusCode = 544,
  143. StatusDescription = "ArchiverError",
  144. Headers = "content-type: text/plain; charset=utf-8",
  145. Content = Encoding.UTF8.GetBytes(e.ToString())
  146. };
  147. return entry;
  148. }
  149. LogTrace($"GET {url} -- {entry.StatusCode}|{resp.ContentType}|{entry.Content.Length}");
  150. var listHeaders = new List<HttpHeader> { new HttpHeader("content-type", resp.ContentType) };
  151. listHeaders.AddRange(resp.GetServerTimeHeaders());
  152. if (resp.GetRedirect(out var redirectUrl))
  153. {
  154. hssExtractedLinks.AddRange(ProcessLinkFilters(uri, redirectUrl));
  155. listHeaders.Add(new HttpHeader("location", redirectUrl));
  156. }
  157. else
  158. {
  159. var linksOnPage = new HashSet<string>();
  160. if (resp.GetHtmlDocument(out var doc)) linksOnPage.AddRange(LinkProcessor.ExtractLinks(doc).Distinct());
  161. else if (resp.GetCss(out var css)) linksOnPage.AddRange(LinkProcessor.FromCss(css).Distinct());
  162. linksOnPage.Remove("//");
  163. hssExtractedLinks.AddRange(ProcessLinkFilters(uri, linksOnPage.ToArray()));
  164. }
  165. entry.Headers = listHeaders.ToStringLines();
  166. return entry;
  167. }
  168. var results = urls
  169. #if !DEBUG
  170. .AsParallel().WithDegreeOfParallelism(conf.Parallel)
  171. #endif
  172. .Select(PageProce)
  173. .ToArray();
  174. LogInfo($"Saving extracted {hssExtractedLinks.Count} new link...");
  175. var n = _db.BulkAddNewArchiveEntry(hssExtractedLinks);
  176. LogInfo($"Saved new {n} link...");
  177. LogInfo($"Updating {results.Length} entry...");
  178. var u = _db.BulkUpdateArchiveEntry(results);
  179. LogInfo($"Updated {u} entry...");
  180. }
  181. IsRunning = false;
  182. LogInfo("Finished!");
  183. }
  184. }
  185. }