5
/// <summary>Provides functions for dealing with textual data.</summary>
6
public static class Text {
9
// --- public functions ---
/// <summary>Splits a text at line boundaries and returns an array of lines.</summary>
/// <param name="text">The text to split.</param>
/// <returns>The lines of the text in their original order. Empty lines are preserved because the split is performed with <see cref="StringSplitOptions.None"/>.</returns>
/// <exception cref="System.NullReferenceException">Raised when the text is a null reference.</exception>
public static string[] SplitLines(string text) {
    // Every entry of the Newlines table acts as a line terminator.
    return text.Split(Newlines, StringSplitOptions.None);
}
/// <summary>Takes an array of bytes representing text, identifies the encoding of that text by the byte order mark, and returns the corresponding text.</summary>
/// <param name="data">The array of bytes representing the text.</param>
/// <param name="fallback">A fallback encoding, used only when no encoding could be determined from the data.</param>
/// <returns>The text represented by the byte data.</returns>
/// <remarks>The complete byte array, including any byte order mark at its start, is handed to the selected encoding for decoding.</remarks>
/// <exception cref="System.NullReferenceException">Raised when the array of bytes is a null reference.</exception>
public static string GetTextFromBytes(byte[] data, Encoding fallback) {
    Encoding encoding;
    if (GetEncodingFromBytes(data, out encoding)) {
        return encoding.GetString(data);
    }
    // No usable encoding was detected, so decode with the caller-supplied one.
    return fallback.GetString(data);
}
/// <summary>Takes an array of bytes representing text, identifies the encoding of that text by the byte order mark, and returns an array of lines of the corresponding text.</summary>
/// <param name="data">The array of bytes representing the text.</param>
/// <param name="fallback">A fallback encoding in case the encoding cannot be determined.</param>
/// <returns>The array of lines in the text represented by the byte data.</returns>
/// <exception cref="System.NullReferenceException">Raised when the array of bytes is a null reference.</exception>
public static string[] GetLinesFromBytes(byte[] data, Encoding fallback) {
    // Decode first, then split the decoded text at line boundaries.
    return SplitLines(GetTextFromBytes(data, fallback));
}
/// <summary>Takes a text file, identifies its encoding, and returns the corresponding text.</summary>
/// <param name="file">The path of the text file.</param>
/// <param name="fallback">A fallback encoding in case the encoding cannot be determined.</param>
/// <returns>The text contained in the file.</returns>
/// <remarks>The whole file is read into memory before it is decoded.</remarks>
public static string GetTextFromFile(string file, Encoding fallback) {
    byte[] contents = System.IO.File.ReadAllBytes(file);
    return GetTextFromBytes(contents, fallback);
}
/// <summary>Takes a text file, identifies its encoding, and returns an array of lines of the corresponding text.</summary>
/// <param name="file">The path of the text file.</param>
/// <param name="fallback">A fallback encoding in case the encoding cannot be determined.</param>
/// <returns>The array of lines of the text contained in the file.</returns>
/// <remarks>The whole file is read into memory before it is decoded and split.</remarks>
public static string[] GetLinesFromFile(string file, Encoding fallback) {
    byte[] contents = System.IO.File.ReadAllBytes(file);
    return GetLinesFromBytes(contents, fallback);
}
/// <summary>Takes an array of bytes representing text, identifies the encoding of that text by the byte order mark, and returns the corresponding encoding in an output parameter.</summary>
/// <param name="data">The byte data of the text.</param>
/// <param name="encoding">Receives the encoding on success. Receives a null reference when the data does not start with any of the recognized byte order marks.</param>
/// <returns>A boolean indicating whether a byte order mark was found and the matching encoding is available on this system.</returns>
/// <remarks>
/// The four-byte UTF-32 marks are tested before the two-byte UTF-16 marks, because the UTF-32 little-endian mark (FF FE 00 00) starts with the UTF-16 little-endian mark (FF FE) and would otherwise never be recognized.
/// As a consequence, UTF-16 little-endian data whose first character after the mark is U+0000 is reported as UTF-32 little-endian.
/// </remarks>
/// <exception cref="System.NullReferenceException">Raised when the array of bytes is a null reference.</exception>
public static bool GetEncodingFromBytes(byte[] data, out Encoding encoding) {
    if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
        // EF BB BF: UTF-8
        return TryGetEncoding("utf-8", out encoding);
    } else if (data.Length >= 4 && data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF) {
        // 00 00 FE FF: UTF-32, big-endian
        return TryGetEncoding("utf-32BE", out encoding);
    } else if (data.Length >= 4 && data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00) {
        // FF FE 00 00: UTF-32, little-endian (must precede the FF FE test below)
        return TryGetEncoding("utf-32", out encoding);
    } else if (data.Length >= 2 && data[0] == 0xFE && data[1] == 0xFF) {
        // FE FF: UTF-16, big-endian
        return TryGetEncoding("unicodeFFFE", out encoding);
    } else if (data.Length >= 2 && data[0] == 0xFF && data[1] == 0xFE) {
        // FF FE: UTF-16, little-endian
        return TryGetEncoding("utf-16", out encoding);
    } else if (data.Length >= 4 && data[0] == 0x2B && data[1] == 0x2F && data[2] == 0x76 && (data[3] == 0x38 || data[3] == 0x39 || data[3] == 0x2B || data[3] == 0x2F)) {
        // 2B 2F 76 followed by 38, 39, 2B or 2F: UTF-7
        return TryGetEncoding("utf-7", out encoding);
    } else if (data.Length >= 4 && data[0] == 0x84 && data[1] == 0x31 && data[2] == 0x95 && data[3] == 0x33) {
        // 84 31 95 33: GB18030
        return TryGetEncoding("GB18030", out encoding);
    } else {
        // No recognized byte order mark.
        encoding = null;
        return false;
    }
}
87
// --- private fields and functions ---
/// <summary>Represents the set of Unicode newlines.</summary>
/// <remarks>
/// The entries are, in order: CR LF, LF, FF (U+000C), CR, NEL (U+0085), LS (U+2028) and PS (U+2029).
/// The two-character CR LF sequence is listed ahead of the lone CR entry; when several separators match at the same position, String.Split uses the one that comes first in the array, so a CR LF pair produces one line break instead of two.
/// </remarks>
private static readonly string[] Newlines = new string[] { "\x0D\x0A", "\x0A", "\x0C", "\x0D", "\x85", "\u2028", "\u2029" };
92
/// <summary>Gets a specified encoding by name provided that the specified encoding is available on this system.</summary>
93
/// <param name="name">The name of the encoding.</param>
94
/// <param name="encoding">On success, receives the encoding.</param>
95
/// <returns>The success of the operation.</returns>
96
private static bool TryGetEncoding(string name, out Encoding encoding) {
98
encoding = Encoding.GetEncoding(name);
b'\\ No newline at end of file'