WarcParser.cs 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. using System.Text;
  2. namespace WarcViewerBlazorWinForm.Backend.Warc
  3. {
  4. internal static class WarcParser
  5. {
  6. public const string WARC_ENTRY_HEADER_PREFIX = "WARC/";
  7. public static byte[] ReadEntryBlock(Stream inputStream)
  8. {
  9. var contentLength = inputStream.ReadWarcEntryAndExtractContentLengthAndOutBytes(out var headerBytes);
  10. if (contentLength == -1) return null;
  11. var entry = inputStream.ReadBlock(contentLength + 4); // + \r\n \r\n
  12. var returnBuf = new byte[contentLength + headerBytes!.Length + 4];
  13. Buffer.BlockCopy(headerBytes, 0, returnBuf, 0, headerBytes.Length);
  14. Buffer.BlockCopy(entry.Array!, entry.Offset, returnBuf, headerBytes.Length, entry.Count);
  15. return returnBuf;
  16. }
  17. public static WarcEntry ParseBlockForIndexing(byte[] block)
  18. {
  19. var ms = new MemoryStream(block);
  20. var lines = Encoding.ASCII.GetString(ms.ReadLinesUntilBlankLineBytes()!).Split("\r\n");
  21. var entry = new WarcEntry();
  22. foreach (var line in lines.Skip(1))
  23. {
  24. var parts = line.Split(':', 2);
  25. if (parts.Length != 2) continue;
  26. switch (parts[0])
  27. {
  28. case "WARC-Type":
  29. entry.Type = parts[1].Trim();
  30. break;
  31. case "WARC-Target-URI":
  32. entry.Url = parts[1].Trim();
  33. break;
  34. }
  35. }
  36. return entry;
  37. }
  38. public static WarcEntry ParseBlockDetail(byte[] block)
  39. {
  40. //TODO: WarcEntry Parse
  41. throw new NotImplementedException();
  42. }
  43. private static int ReadWarcEntryAndExtractContentLengthAndOutBytes(this Stream stream, out byte[]? bytes)
  44. {
  45. bytes = stream.ReadLinesUntilBlankLineBytes();
  46. if (bytes == null) return -1;
  47. var lines = Encoding.ASCII.GetString(bytes).Split("\r\n");
  48. if (lines.Length < 2) throw new InvalidDataException("Insufficient header lines");
  49. if (lines[0].StartsWith(WARC_ENTRY_HEADER_PREFIX) == false) throw new Exception("WARC entry header mismatch:" + lines[0]);
  50. return ExtractContentLengthFromLines(lines.Skip(1));
  51. }
  52. private static int ExtractContentLengthFromLines(IEnumerable<string> lines)
  53. {
  54. foreach (var line in lines)
  55. {
  56. var nameValue = line.Split(':', 2);
  57. if (nameValue.Length != 2) throw new InvalidDataException("Invalid header line:" + line);
  58. var name = nameValue[0];
  59. var value = nameValue[1].Trim();
  60. var lowerName = name.ToLower();
  61. if (lowerName == "content-length" && long.TryParse(value, out var contentLength))
  62. {
  63. return (int)contentLength;
  64. }
  65. }
  66. throw new Exception("Missing content-length in WARC headers");
  67. }
  68. private static ArraySegment<byte> ReadBlock(this Stream stream, int count)
  69. {
  70. if (stream == null) throw new ArgumentNullException(nameof(stream));
  71. var buffer = new byte[count];
  72. var bytesRead = 0;
  73. while (bytesRead < count)
  74. {
  75. var read = stream.Read(buffer, bytesRead, count - bytesRead);
  76. // 已到达流的末尾
  77. if (read == 0) break;
  78. bytesRead += read;
  79. }
  80. if (bytesRead < count) throw new EndOfStreamException($"Unexpected end of stream. Expected to read {count} bytes, but only read {bytesRead} bytes.");
  81. return buffer;
  82. }
  83. private static byte[]? ReadLinesUntilBlankLineBytes(this Stream stream)
  84. {
  85. var ms = new MemoryStream(128 * 32);
  86. do
  87. {
  88. var line = stream.ReadLineBytes();
  89. if (line == null) return null;
  90. ms.Write(line);
  91. ms.WriteByte((byte)'\r');
  92. ms.WriteByte((byte)'\n');
  93. if (line.Length == 0) break;
  94. } while (true);
  95. return ms.ToArray();
  96. }
  97. private static byte[]? ReadLineBytes(this Stream stream)
  98. {
  99. var line = new MemoryStream(128);
  100. int currentByte;
  101. while ((currentByte = stream.ReadByte()) != -1)
  102. {
  103. var charValue = (byte)currentByte;
  104. if (charValue == (byte)'\r')
  105. {
  106. // 遇到回车符,继续读取下一个字符
  107. var nextChar = stream.ReadByte();
  108. if (nextChar == -1)
  109. {
  110. // 遇到文件末尾
  111. break;
  112. }
  113. var nextCharValue = (byte)nextChar;
  114. if (nextCharValue == (byte)'\n')
  115. {
  116. // 遇到换行符,结束读取
  117. break;
  118. }
  119. // 回车符后面不是换行符,将回车符添加到行内容
  120. line.WriteByte(charValue);
  121. line.WriteByte(nextCharValue);
  122. }
  123. else if (charValue == '\n')
  124. {
  125. // 遇到换行符,结束读取
  126. break;
  127. }
  128. else
  129. {
  130. // 其他字符,添加到行内容
  131. line.WriteByte(charValue);
  132. }
  133. }
  134. if (line.Length == 0 && currentByte == -1)
  135. {
  136. return null; // 已到达流的末尾
  137. }
  138. return line.ToArray();
  139. }
  140. }
  141. }