123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- using System.Text;
- namespace WarcViewerBlazorWinForm.Backend.Warc
- {
/// <summary>
/// Helpers for reading WARC records from a stream and extracting header fields from them.
/// </summary>
internal static class WarcParser
{
    /// <summary>Prefix every WARC record's first header line must start with.</summary>
    public const string WARC_ENTRY_HEADER_PREFIX = "WARC/";

    /// <summary>Length of the CRLF CRLF sequence that follows a record's content block.</summary>
    private const int RecordTerminatorLength = 4;

    /// <summary>
    /// Reads one complete WARC record (header section, content block and the trailing
    /// CRLF CRLF) from <paramref name="inputStream"/>.
    /// </summary>
    /// <param name="inputStream">Stream positioned at the first byte of a record.</param>
    /// <returns>
    /// The record bytes, with header line endings normalised to CRLF, or <c>null</c> when the
    /// stream ends before a complete header section could be read.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputStream"/> is null.</exception>
    /// <exception cref="InvalidDataException">
    /// The header is malformed, has no usable Content-Length, or the record is too large to buffer.
    /// </exception>
    /// <exception cref="EndOfStreamException">The stream ends inside the content block.</exception>
    public static byte[]? ReadEntryBlock(Stream inputStream)
    {
        if (inputStream == null)
        {
            throw new ArgumentNullException(nameof(inputStream));
        }

        var headerBytes = inputStream.ReadWarcEntryHeader(out var contentLength);
        if (headerBytes == null)
        {
            return null;
        }

        // Size arithmetic is done in long so that a huge Content-Length cannot wrap around.
        var totalLength = (long)headerBytes.Length + contentLength + RecordTerminatorLength;
        if (totalLength > int.MaxValue)
        {
            throw new InvalidDataException("WARC record is too large to buffer: " + totalLength + " bytes");
        }

        var record = new byte[totalLength];
        Buffer.BlockCopy(headerBytes, 0, record, 0, headerBytes.Length);

        // Read the content block plus its terminator straight into the result buffer.
        inputStream.ReadExactlyInto(record, headerBytes.Length, contentLength + RecordTerminatorLength);
        return record;
    }

    /// <summary>
    /// Extracts the fields needed for indexing (WARC-Type and WARC-Target-URI) from a record.
    /// </summary>
    /// <param name="block">Raw record bytes starting with the header section.</param>
    /// <returns>A <c>WarcEntry</c> with <c>Type</c> and <c>Url</c> set when the fields are present.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="block"/> is null.</exception>
    /// <exception cref="InvalidDataException">The block has no complete header section.</exception>
    public static WarcEntry ParseBlockForIndexing(byte[] block)
    {
        if (block == null)
        {
            throw new ArgumentNullException(nameof(block));
        }

        using var blockStream = new MemoryStream(block, writable: false);
        var headerBytes = blockStream.ReadLinesUntilBlankLineBytes();
        if (headerBytes == null)
        {
            throw new InvalidDataException("WARC block does not contain a complete header section");
        }

        var entry = new WarcEntry();

        // WARC header fields are UTF-8 text; the first line is the version line, not a field.
        foreach (var line in Encoding.UTF8.GetString(headerBytes).Split("\r\n").Skip(1))
        {
            var parts = line.Split(':', 2);
            if (parts.Length != 2)
            {
                continue;
            }

            // Field names are matched case-insensitively, like Content-Length below.
            var name = parts[0];
            if (name.Equals("WARC-Type", StringComparison.OrdinalIgnoreCase))
            {
                entry.Type = parts[1].Trim();
            }
            else if (name.Equals("WARC-Target-URI", StringComparison.OrdinalIgnoreCase))
            {
                entry.Url = parts[1].Trim();
            }
        }

        return entry;
    }

    /// <summary>Full record parsing; not implemented yet.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public static WarcEntry ParseBlockDetail(byte[] block)
    {
        //TODO: WarcEntry Parse
        throw new NotImplementedException();
    }

    /// <summary>
    /// Reads a record's header section, validates the version line and extracts Content-Length.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of a record.</param>
    /// <param name="contentLength">Parsed Content-Length; 0 when the method returns null.</param>
    /// <returns>The header bytes (CRLF-normalised), or null when the stream ended first.</returns>
    private static byte[]? ReadWarcEntryHeader(this Stream stream, out int contentLength)
    {
        contentLength = 0;

        var headerBytes = stream.ReadLinesUntilBlankLineBytes();
        if (headerBytes == null)
        {
            return null;
        }

        var lines = Encoding.UTF8.GetString(headerBytes).Split("\r\n");
        if (!lines[0].StartsWith(WARC_ENTRY_HEADER_PREFIX, StringComparison.Ordinal))
        {
            throw new InvalidDataException("WARC entry header mismatch:" + lines[0]);
        }

        contentLength = ExtractContentLengthFromLines(lines.Skip(1));
        return headerBytes;
    }

    /// <summary>
    /// Finds the Content-Length field among header lines and returns its value.
    /// </summary>
    /// <param name="lines">Header lines after the version line.</param>
    /// <exception cref="InvalidDataException">
    /// A line is not a "name: value" pair, Content-Length is missing, or its value is not a
    /// non-negative integer that fits in an <see cref="int"/>.
    /// </exception>
    private static int ExtractContentLengthFromLines(IEnumerable<string> lines)
    {
        foreach (var line in lines)
        {
            // The blank line terminates the header section.
            if (line.Length == 0)
            {
                break;
            }

            // A line starting with whitespace continues the previous field's value.
            if (line[0] == ' ' || line[0] == '\t')
            {
                continue;
            }

            var nameValue = line.Split(':', 2);
            if (nameValue.Length != 2)
            {
                throw new InvalidDataException("Invalid header line:" + line);
            }

            if (!nameValue[0].Equals("content-length", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var value = nameValue[1].Trim();

            // NumberStyles.None accepts digits only, so signs and embedded whitespace are rejected.
            var parsed = long.TryParse(
                value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out var contentLength);
            if (!parsed || contentLength > int.MaxValue)
            {
                throw new InvalidDataException("Invalid Content-Length value:" + value);
            }

            return (int)contentLength;
        }

        throw new InvalidDataException("Missing content-length in WARC headers");
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> with exactly <paramref name="count"/> bytes from the stream,
    /// starting at <paramref name="offset"/>.
    /// </summary>
    /// <exception cref="EndOfStreamException">The stream ends before all bytes were read.</exception>
    private static void ReadExactlyInto(this Stream stream, byte[] buffer, int offset, int count)
    {
        var bytesRead = 0;
        while (bytesRead < count)
        {
            // Stream.Read may return fewer bytes than requested, so keep reading until done.
            var read = stream.Read(buffer, offset + bytesRead, count - bytesRead);
            if (read == 0)
            {
                // End of stream reached before the requested amount was available.
                throw new EndOfStreamException($"Unexpected end of stream. Expected to read {count} bytes, but only read {bytesRead} bytes.");
            }

            bytesRead += read;
        }
    }

    /// <summary>
    /// Reads lines up to and including the first blank line and returns them joined with CRLF.
    /// </summary>
    /// <returns>
    /// The collected bytes, each line (including the blank one) followed by CRLF, or null when
    /// the stream ends before a blank line is found.
    /// </returns>
    private static byte[]? ReadLinesUntilBlankLineBytes(this Stream stream)
    {
        using var collected = new MemoryStream(128 * 32);
        while (true)
        {
            var line = stream.ReadLineBytes();
            if (line == null)
            {
                return null;
            }

            collected.Write(line, 0, line.Length);
            collected.WriteByte((byte)'\r');
            collected.WriteByte((byte)'\n');

            if (line.Length == 0)
            {
                return collected.ToArray();
            }
        }
    }

    /// <summary>
    /// Reads one line, terminated by CRLF or a bare LF, without its terminator.
    /// </summary>
    /// <returns>
    /// The line bytes (possibly empty), or null when the stream is already at its end.
    /// A final line without a terminator is returned as-is.
    /// </returns>
    private static byte[]? ReadLineBytes(this Stream stream)
    {
        using var line = new MemoryStream(128);

        var current = stream.ReadByte();
        while (current != -1)
        {
            if (current == '\n')
            {
                // Bare line feed ends the line.
                return line.ToArray();
            }

            if (current == '\r')
            {
                var next = stream.ReadByte();
                if (next == -1 || next == '\n')
                {
                    // CRLF ends the line; a carriage return right at end of stream is dropped.
                    return line.ToArray();
                }

                // Lone carriage return is data. Re-examine the following byte rather than
                // blindly appending it, so that "\r\r\n" still ends the line at the CRLF.
                line.WriteByte((byte)'\r');
                current = next;
                continue;
            }

            // Any other byte belongs to the line.
            line.WriteByte((byte)current);
            current = stream.ReadByte();
        }

        // End of stream: nothing read means there are no more lines.
        return line.Length == 0 ? null : line.ToArray();
    }
}
- }
|