-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDocumentLoader.cs
More file actions
83 lines (66 loc) · 2.18 KB
/
Copy pathDocumentLoader.cs
File metadata and controls
83 lines (66 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
using System.Text;
using UglyToad.PdfPig;
namespace GroupDynamic;
/// <summary>
/// Loads the textual content of a document from disk. Files with a <c>.pdf</c>
/// extension are read through PdfPig; every other file is treated as plain text
/// whose encoding is detected from its byte order mark, falling back to strict
/// UTF-8 and then to Windows-1251.
/// </summary>
public static class DocumentLoader
{
    /// <summary>Code page used when a BOM-less file is not valid UTF-8.</summary>
    private const int FallbackCodePage = 1251;

    /// <summary>Text returned in place of content when no text could be extracted from a PDF.</summary>
    private const string EmptyPdfMessage = "Текст из PDF извлечь не удалось. Возможно, документ состоит из сканов или изображений.";

    /// <summary>
    /// UTF-8 decoder that throws <see cref="DecoderFallbackException"/> on malformed input,
    /// which is what lets <see cref="LoadTextFile"/> tell UTF-8 apart from a legacy code page.
    /// </summary>
    private static readonly UTF8Encoding StrictUtf8 =
        new(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    static DocumentLoader()
    {
        // Registers the code-page provider so Encoding.GetEncoding(1251) used by the fallback can be resolved.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <summary>
    /// Reads the file at <paramref name="path"/> and returns its text together with a flag
    /// telling whether it was handled as a PDF.
    /// </summary>
    /// <param name="path">Path of the file to load. The extension decides how the file is read.</param>
    /// <returns>The loaded text and whether the PDF branch was taken.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="path"/> is <c>null</c>.</exception>
    public static LoadedDocument Load(string path)
    {
        // Fail with a descriptive exception instead of a NullReferenceException further down.
        ArgumentNullException.ThrowIfNull(path);

        bool isPdf = string.Equals(Path.GetExtension(path), ".pdf", StringComparison.OrdinalIgnoreCase);
        string text = isPdf ? LoadPdfText(path) : LoadTextFile(path);
        return new LoadedDocument(text, isPdf);
    }

    /// <summary>
    /// Reads a plain-text file and decodes it. A byte order mark (UTF-8, UTF-32 LE/BE,
    /// UTF-16 LE/BE) selects the encoding and is stripped from the result; without one the
    /// bytes are decoded as strict UTF-8, and if that fails, as code page 1251.
    /// </summary>
    /// <param name="path">Path of the text file.</param>
    /// <returns>The decoded file content.</returns>
    private static string LoadTextFile(string path)
    {
        byte[] bytes = File.ReadAllBytes(path);

        if (bytes.Length >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
        {
            return DecodeAfterBom(Encoding.UTF8, bytes, 3);
        }

        // UTF-32 must be tested before UTF-16: the UTF-32 LE mark (FF FE 00 00) begins with
        // the UTF-16 LE mark (FF FE) and would otherwise be decoded as UTF-16.
        // NOTE: a UTF-16 LE file whose first character is U+0000 is indistinguishable from
        // UTF-32 LE here and is read as UTF-32.
        if (bytes.Length >= 4)
        {
            if (bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0x00 && bytes[3] == 0x00)
            {
                return DecodeAfterBom(new UTF32Encoding(bigEndian: false, byteOrderMark: true), bytes, 4);
            }

            if (bytes[0] == 0x00 && bytes[1] == 0x00 && bytes[2] == 0xFE && bytes[3] == 0xFF)
            {
                return DecodeAfterBom(new UTF32Encoding(bigEndian: true, byteOrderMark: true), bytes, 4);
            }
        }

        if (bytes.Length >= 2)
        {
            if (bytes[0] == 0xFF && bytes[1] == 0xFE)
            {
                return DecodeAfterBom(Encoding.Unicode, bytes, 2);
            }

            if (bytes[0] == 0xFE && bytes[1] == 0xFF)
            {
                return DecodeAfterBom(Encoding.BigEndianUnicode, bytes, 2);
            }
        }

        try
        {
            return StrictUtf8.GetString(bytes);
        }
        catch (DecoderFallbackException)
        {
            // Not valid UTF-8: reinterpret the same bytes using the legacy code page.
            return Encoding.GetEncoding(FallbackCodePage).GetString(bytes);
        }
    }

    /// <summary>
    /// Decodes <paramref name="bytes"/> with <paramref name="encoding"/>, skipping the first
    /// <paramref name="bomLength"/> bytes without copying the array.
    /// </summary>
    private static string DecodeAfterBom(Encoding encoding, byte[] bytes, int bomLength)
    {
        return encoding.GetString(bytes, bomLength, bytes.Length - bomLength);
    }

    /// <summary>
    /// Concatenates the text of every page of a PDF. Pages whose text is empty or whitespace
    /// are skipped; each remaining page is trimmed and followed by a blank line.
    /// </summary>
    /// <param name="path">Path of the PDF file.</param>
    /// <returns>
    /// The combined page text with trailing whitespace removed, or a fixed explanatory
    /// message when no page produced any text.
    /// </returns>
    private static string LoadPdfText(string path)
    {
        StringBuilder builder = new();
        using PdfDocument document = PdfDocument.Open(path);

        foreach (var page in document.GetPages())
        {
            string pageText = page.Text;
            if (string.IsNullOrWhiteSpace(pageText))
            {
                continue;
            }

            builder.AppendLine(pageText.Trim());
            builder.AppendLine();
        }

        // Nothing was appended, so no page contained extractable text.
        if (builder.Length == 0)
        {
            return EmptyPdfMessage;
        }

        return builder.ToString().TrimEnd();
    }
}
/// <summary>
/// Result of <see cref="DocumentLoader.Load"/>: the text obtained from a file and
/// whether the file was handled as a PDF.
/// </summary>
/// <param name="Text">
/// Text read from the file. For a PDF from which no text could be extracted,
/// <see cref="DocumentLoader.Load"/> stores a fixed explanatory message here instead.
/// </param>
/// <param name="IsPdf">
/// <c>true</c> when <see cref="DocumentLoader.Load"/> took the PDF branch (file extension
/// <c>.pdf</c>); <c>false</c> when the file was read as plain text.
/// </param>
public record LoadedDocument(string Text, bool IsPdf);