Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * ucs.c - Universal Character Set processing
4 : */
5 :
6 : #include <linux/array_size.h>
7 : #include <linux/bsearch.h>
8 : #include <linux/consolemap.h>
9 : #include <linux/minmax.h>
10 :
11 : struct ucs_interval16 {
12 : u16 first;
13 : u16 last;
14 : };
15 :
16 : struct ucs_interval32 {
17 : u32 first;
18 : u32 last;
19 : };
20 :
21 : #include "ucs_width_table.h"
22 :
23 0 : static int interval16_cmp(const void *key, const void *element)
24 : {
25 0 : u16 cp = *(u16 *)key;
26 0 : const struct ucs_interval16 *entry = element;
27 :
28 0 : if (cp < entry->first)
29 0 : return -1;
30 0 : if (cp > entry->last)
31 0 : return 1;
32 0 : return 0;
33 0 : }
34 :
35 0 : static int interval32_cmp(const void *key, const void *element)
36 : {
37 0 : u32 cp = *(u32 *)key;
38 0 : const struct ucs_interval32 *entry = element;
39 :
40 0 : if (cp < entry->first)
41 0 : return -1;
42 0 : if (cp > entry->last)
43 0 : return 1;
44 0 : return 0;
45 0 : }
46 :
47 0 : static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size)
48 : {
49 0 : if (cp < ranges[0].first || cp > ranges[size - 1].last)
50 0 : return false;
51 :
52 0 : return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
53 0 : interval16_cmp) != NULL;
54 0 : }
55 :
56 0 : static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size)
57 : {
58 0 : if (cp < ranges[0].first || cp > ranges[size - 1].last)
59 0 : return false;
60 :
61 0 : return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
62 0 : interval32_cmp) != NULL;
63 0 : }
64 :
/* True when @cp fits in 16 bits, i.e. lies within U+0000..U+FFFF (the BMP) */
#define UCS_IS_BMP(cp)	((cp) < 0x10000)
66 :
67 : /**
68 : * ucs_is_zero_width() - Determine if a Unicode code point is zero-width.
69 : * @cp: Unicode code point (UCS-4)
70 : *
71 : * Return: true if the character is zero-width, false otherwise
72 : */
73 0 : bool ucs_is_zero_width(u32 cp)
74 : {
75 0 : if (UCS_IS_BMP(cp))
76 0 : return cp_in_range16(cp, ucs_zero_width_bmp_ranges,
77 : ARRAY_SIZE(ucs_zero_width_bmp_ranges));
78 : else
79 0 : return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges,
80 : ARRAY_SIZE(ucs_zero_width_non_bmp_ranges));
81 0 : }
82 :
83 : /**
84 : * ucs_is_double_width() - Determine if a Unicode code point is double-width.
85 : * @cp: Unicode code point (UCS-4)
86 : *
87 : * Return: true if the character is double-width, false otherwise
88 : */
89 0 : bool ucs_is_double_width(u32 cp)
90 : {
91 0 : if (UCS_IS_BMP(cp))
92 0 : return cp_in_range16(cp, ucs_double_width_bmp_ranges,
93 : ARRAY_SIZE(ucs_double_width_bmp_ranges));
94 : else
95 0 : return cp_in_range32(cp, ucs_double_width_non_bmp_ranges,
96 : ARRAY_SIZE(ucs_double_width_non_bmp_ranges));
97 0 : }
98 :
99 : /*
100 : * Structure for base with combining mark pairs and resulting recompositions.
101 : * Using u16 to save space since all values are within BMP range.
102 : */
103 : struct ucs_recomposition {
104 : u16 base; /* base character */
105 : u16 mark; /* combining mark */
106 : u16 recomposed; /* corresponding recomposed character */
107 : };
108 :
109 : #include "ucs_recompose_table.h"
110 :
111 : struct compare_key {
112 : u16 base;
113 : u16 mark;
114 : };
115 :
116 0 : static int recomposition_cmp(const void *key, const void *element)
117 : {
118 0 : const struct compare_key *search_key = key;
119 0 : const struct ucs_recomposition *entry = element;
120 :
121 : /* Compare base character first */
122 0 : if (search_key->base < entry->base)
123 0 : return -1;
124 0 : if (search_key->base > entry->base)
125 0 : return 1;
126 :
127 : /* Base characters match, now compare combining character */
128 0 : if (search_key->mark < entry->mark)
129 0 : return -1;
130 0 : if (search_key->mark > entry->mark)
131 0 : return 1;
132 :
133 : /* Both match */
134 0 : return 0;
135 0 : }
136 :
137 : /**
138 : * ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
139 : * @base: Base Unicode code point (UCS-4)
140 : * @mark: Combining mark Unicode code point (UCS-4)
141 : *
142 : * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
143 : */
144 0 : u32 ucs_recompose(u32 base, u32 mark)
145 : {
146 : /* Check if characters are within the range of our table */
147 0 : if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
148 0 : mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
149 0 : return 0;
150 :
151 0 : struct compare_key key = { base, mark };
152 0 : struct ucs_recomposition *result =
153 0 : __inline_bsearch(&key, ucs_recomposition_table,
154 : ARRAY_SIZE(ucs_recomposition_table),
155 : sizeof(*ucs_recomposition_table),
156 : recomposition_cmp);
157 :
158 0 : return result ? result->recomposed : 0;
159 0 : }
160 :
161 : /*
162 : * The fallback table structures implement a 2-level lookup.
163 : */
164 :
165 : struct ucs_page_desc {
166 : u8 page; /* Page index (high byte of code points) */
167 : u8 count; /* Number of entries in this page */
168 : u16 start; /* Start index in entries array */
169 : };
170 :
171 : struct ucs_page_entry {
172 : u8 offset; /* Offset within page (0-255) */
173 : u8 fallback; /* Fallback character or range start marker */
174 : };
175 :
176 : #include "ucs_fallback_table.h"
177 :
178 0 : static int ucs_page_desc_cmp(const void *key, const void *element)
179 : {
180 0 : u8 page = *(u8 *)key;
181 0 : const struct ucs_page_desc *entry = element;
182 :
183 0 : if (page < entry->page)
184 0 : return -1;
185 0 : if (page > entry->page)
186 0 : return 1;
187 0 : return 0;
188 0 : }
189 :
190 0 : static int ucs_page_entry_cmp(const void *key, const void *element)
191 : {
192 0 : u8 offset = *(u8 *)key;
193 0 : const struct ucs_page_entry *entry = element;
194 :
195 0 : if (offset < entry->offset)
196 0 : return -1;
197 0 : if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) {
198 0 : if (offset > entry[1].offset)
199 0 : return 1;
200 0 : } else {
201 0 : if (offset > entry->offset)
202 0 : return 1;
203 : }
204 0 : return 0;
205 0 : }
206 :
207 : /**
208 : * ucs_get_fallback() - Get a substitution for the provided Unicode character
209 : * @cp: Unicode code point (UCS-4)
210 : *
211 : * Get a simpler fallback character for the provided Unicode character.
212 : * This is used for terminal display when corresponding glyph is unavailable.
213 : * The substitution may not be as good as the actual glyph for the original
214 : * character but still way more helpful than a squared question mark.
215 : *
216 : * Return: Fallback Unicode code point, or 0 if none is available
217 : */
218 0 : u32 ucs_get_fallback(u32 cp)
219 : {
220 0 : const struct ucs_page_desc *page;
221 0 : const struct ucs_page_entry *entry;
222 0 : u8 page_idx = cp >> 8, offset = cp;
223 :
224 0 : if (!UCS_IS_BMP(cp))
225 0 : return 0;
226 :
227 : /*
228 : * Full-width to ASCII mapping (covering all printable ASCII 33-126)
229 : * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
230 : * We process them programmatically to reduce the table size.
231 : */
232 0 : if (cp >= 0xFF01 && cp <= 0xFF5E)
233 0 : return cp - 0xFF01 + 33;
234 :
235 0 : page = __inline_bsearch(&page_idx, ucs_fallback_pages,
236 : ARRAY_SIZE(ucs_fallback_pages),
237 : sizeof(*ucs_fallback_pages),
238 : ucs_page_desc_cmp);
239 0 : if (!page)
240 0 : return 0;
241 :
242 0 : entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
243 0 : page->count, sizeof(*ucs_fallback_entries),
244 : ucs_page_entry_cmp);
245 0 : if (!entry)
246 0 : return 0;
247 :
248 0 : if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER)
249 0 : entry++;
250 0 : return entry->fallback;
251 0 : }
|