GNU Unifont  15.0.05
Pan-Unicode font with complete Unicode Plane 0 coverage and partial coverage of higher planes
unigenwidth.c
Go to the documentation of this file.
1 /**
2  @file unigenwidth.c
3 
4  @brief unigenwidth - IEEE 1003.1-2008 setup to calculate
5  wchar_t string widths
6 
7  @author Paul Hardy.
8 
9  @copyright Copyright (C) 2013, 2017 Paul Hardy.
10 
11  All glyphs are treated as 16 pixels high, and can be
12  8, 16, 24, or 32 pixels wide (resulting in widths of
13  1, 2, 3, or 4, respectively).
14 */
15 /*
16  LICENSE:
17 
18  This program is free software: you can redistribute it and/or modify
19  it under the terms of the GNU General Public License as published by
20  the Free Software Foundation, either version 2 of the License, or
21  (at your option) any later version.
22 
23  This program is distributed in the hope that it will be useful,
24  but WITHOUT ANY WARRANTY; without even the implied warranty of
25  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26  GNU General Public License for more details.
27 
28  You should have received a copy of the GNU General Public License
29  along with this program. If not, see <http://www.gnu.org/licenses/>.
30 */
31 
32 /*
33  20 June 2017 [Paul Hardy]:
34  - Now handles glyphs that are 24 or 32 pixels wide.
35 
36  8 July 2017 [Paul Hardy]:
37  - Modifies sscanf format strings to ignore second field after
38  the ":" field separator, newly added to "*combining.txt" files
39  and already present in "*.hex" files.
40 */
41 
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 
46 #define MAXSTRING 256 ///< Size in bytes of the buffer that holds one input line.
47 
48 /* Definitions for the Pikto glyphs in Plane 15 (code points 0x0Fxxxx). */
49 #define PIKTO_START 0x0F0E70 ///< First code point of the Pikto range (inclusive).
50 #define PIKTO_END 0x0F11EF ///< Last code point of the Pikto range (inclusive).
51 /** Number of code points in the Pikto range, counting both end points. */
52 #define PIKTO_SIZE (PIKTO_END - PIKTO_START + 1)
53 
54 
55 /**
56  @brief The main function.
57 
58  @param[in] argc The count of command line arguments.
59  @param[in] argv Pointer to array of command line arguments.
60  @return This program exits with status EXIT_SUCCESS.
61 */
62 int
63 main (int argc, char **argv)
64 {
65 
66  int i; /* loop variable */
67 
68  char teststring[MAXSTRING];
69  int loc;
70  char *gstart;
71 
72  char glyph_width[0x20000];
73  char pikto_width[PIKTO_SIZE];
74 
75  FILE *infilefp;
76 
77  if (argc != 3) {
78  fprintf (stderr, "\n\nUsage: %s <unifont.hex> <combining.txt>\n\n", argv[0]);
79  exit (EXIT_FAILURE);
80  }
81 
82  /*
83  Read the collection of hex glyphs.
84  */
85  if ((infilefp = fopen (argv[1],"r")) == NULL) {
86  fprintf (stderr,"ERROR - hex input file %s not found.\n\n", argv[1]);
87  exit (EXIT_FAILURE);
88  }
89 
90  /* Flag glyph as non-existent until found. */
91  memset (glyph_width, -1, 0x20000 * sizeof (char));
92  memset (pikto_width, -1, (PIKTO_SIZE) * sizeof (char));
93 
94  teststring[MAXSTRING-1] = '\0';
95  while (fgets (teststring, MAXSTRING-1, infilefp) != NULL) {
96  sscanf (teststring, "%X:%*s", &loc);
97  if (loc < 0x20000) {
98  gstart = strchr (teststring,':') + 1;
99  /*
100  16 rows per glyph, 2 ASCII hexadecimal digits per byte,
101  so divide number of digits by 32 (shift right 5 bits).
102  */
103  glyph_width[loc] = (strlen (gstart) - 1) >> 5;
104  }
105  else if ((loc >= PIKTO_START) && (loc <= PIKTO_END)) {
106  gstart = strchr (teststring,':') + 1;
107  pikto_width[loc - PIKTO_START] = strlen (gstart) <= 34 ? 1 : 2;
108  }
109  }
110 
111  fclose (infilefp);
112 
113  /*
114  Now read the combining character code points. These have width of 0.
115  */
116  if ((infilefp = fopen (argv[2],"r")) == NULL) {
117  fprintf (stderr,"ERROR - combining characters file %s not found.\n\n", argv[2]);
118  exit (EXIT_FAILURE);
119  }
120 
121  while (fgets (teststring, MAXSTRING-1, infilefp) != NULL) {
122  sscanf (teststring, "%X:%*s", &loc);
123  if (loc < 0x20000) glyph_width[loc] = 0;
124  }
125 
126  fclose (infilefp);
127 
128  /*
129  Code Points with Unusual Properties (Unicode Standard, Chapter 4).
130 
131  As of Unifont 10.0.04, use the widths in the "*-nonprinting.hex"
132  files. If an application is smart enough to know how to handle
133  these special cases, it will not render the "nonprinting" glyph
134  and will treat the code point as being zero-width.
135  */
136 // glyph_width[0]=0; /* NULL character */
137 // for (i = 0x0001; i <= 0x001F; i++) glyph_width[i]=-1; /* Control Characters */
138 // for (i = 0x007F; i <= 0x009F; i++) glyph_width[i]=-1; /* Control Characters */
139 
140 // glyph_width[0x034F]=0; /* combining grapheme joiner */
141 // glyph_width[0x180B]=0; /* Mongolian free variation selector one */
142 // glyph_width[0x180C]=0; /* Mongolian free variation selector two */
143 // glyph_width[0x180D]=0; /* Mongolian free variation selector three */
144 // glyph_width[0x180E]=0; /* Mongolian vowel separator */
145 // glyph_width[0x200B]=0; /* zero width space */
146 // glyph_width[0x200C]=0; /* zero width non-joiner */
147 // glyph_width[0x200D]=0; /* zero width joiner */
148 // glyph_width[0x200E]=0; /* left-to-right mark */
149 // glyph_width[0x200F]=0; /* right-to-left mark */
150 // glyph_width[0x202A]=0; /* left-to-right embedding */
151 // glyph_width[0x202B]=0; /* right-to-left embedding */
152 // glyph_width[0x202C]=0; /* pop directional formatting */
153 // glyph_width[0x202D]=0; /* left-to-right override */
154 // glyph_width[0x202E]=0; /* right-to-left override */
155 // glyph_width[0x2060]=0; /* word joiner */
156 // glyph_width[0x2061]=0; /* function application */
157 // glyph_width[0x2062]=0; /* invisible times */
158 // glyph_width[0x2063]=0; /* invisible separator */
159 // glyph_width[0x2064]=0; /* invisible plus */
160 // glyph_width[0x206A]=0; /* inhibit symmetric swapping */
161 // glyph_width[0x206B]=0; /* activate symmetric swapping */
162 // glyph_width[0x206C]=0; /* inhibit arabic form shaping */
163 // glyph_width[0x206D]=0; /* activate arabic form shaping */
164 // glyph_width[0x206E]=0; /* national digit shapes */
165 // glyph_width[0x206F]=0; /* nominal digit shapes */
166 
167 // /* Variation Selector-1 to Variation Selector-16 */
168 // for (i = 0xFE00; i <= 0xFE0F; i++) glyph_width[i] = 0;
169 
170 // glyph_width[0xFEFF]=0; /* zero width no-break space */
171 // glyph_width[0xFFF9]=0; /* interlinear annotation anchor */
172 // glyph_width[0xFFFA]=0; /* interlinear annotation separator */
173 // glyph_width[0xFFFB]=0; /* interlinear annotation terminator */
174  /*
175  Let glyph widths represent 0xFFFC (object replacement character)
176  and 0xFFFD (replacement character).
177  */
178 
179  /*
180  Hangul Jamo:
181 
182  Leading Consonant (Choseong): leave spacing as is.
183 
184  Hangul Choseong Filler (U+115F): set width to 2.
185 
186  Hangul Jungseong Filler, Hangul Vowel (Jungseong), and
187  Final Consonant (Jongseong): set width to 0, because these
188  combine with the leading consonant as one composite syllabic
189  glyph. As of Unicode 5.2, the Hangul Jamo block (U+1100..U+11FF)
190  is completely filled.
191  */
192  // for (i = 0x1160; i <= 0x11FF; i++) glyph_width[i]=0; /* Vowels & Final Consonants */
193 
194  /*
195  Private Use Area -- the width is undefined, but likely
196  to be 2 charcells wide either from a graphic glyph or
197  from a four-digit hexadecimal glyph representing the
198  code point. Therefore if any PUA glyph does not have
199  a non-zero width yet, assign it a default width of 2.
200  The Unicode Standard allows giving PUA characters
201  default property values; see for example The Unicode
202  Standard Version 5.0, p. 91. This same default is
203  used for higher plane PUA code points below.
204  */
205  // for (i = 0xE000; i <= 0xF8FF; i++) {
206  // if (glyph_width[i] == 0) glyph_width[i]=2;
207  // }
208 
209  /*
210  <not a character>
211  */
212  for (i = 0xFDD0; i <= 0xFDEF; i++) glyph_width[i] = -1;
213  glyph_width[0xFFFE] = -1; /* Byte Order Mark */
214  glyph_width[0xFFFF] = -1; /* Byte Order Mark */
215 
216  /* Surrogate Code Points */
217  for (i = 0xD800; i <= 0xDFFF; i++) glyph_width[i]=-1;
218 
219  /* CJK Code Points */
220  for (i = 0x4E00; i <= 0x9FFF; i++) if (glyph_width[i] < 0) glyph_width[i] = 2;
221  for (i = 0x3400; i <= 0x4DBF; i++) if (glyph_width[i] < 0) glyph_width[i] = 2;
222  for (i = 0xF900; i <= 0xFAFF; i++) if (glyph_width[i] < 0) glyph_width[i] = 2;
223 
224  /*
225  Now generate the output file.
226  */
227  printf ("/*\n");
228  printf (" wcwidth and wcswidth functions, as per IEEE 1003.1-2008\n");
229  printf (" System Interfaces, pp. 2241 and 2251.\n\n");
230  printf (" Author: Paul Hardy, 2013\n\n");
231  printf (" Copyright (c) 2013 Paul Hardy\n\n");
232  printf (" LICENSE:\n");
233  printf ("\n");
234  printf (" This program is free software: you can redistribute it and/or modify\n");
235  printf (" it under the terms of the GNU General Public License as published by\n");
236  printf (" the Free Software Foundation, either version 2 of the License, or\n");
237  printf (" (at your option) any later version.\n");
238  printf ("\n");
239  printf (" This program is distributed in the hope that it will be useful,\n");
240  printf (" but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
241  printf (" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
242  printf (" GNU General Public License for more details.\n");
243  printf ("\n");
244  printf (" You should have received a copy of the GNU General Public License\n");
245  printf (" along with this program. If not, see <http://www.gnu.org/licenses/>.\n");
246  printf ("*/\n\n");
247 
248  printf ("#include <wchar.h>\n\n");
249  printf ("/* Definitions for Pikto CSUR Private Use Area glyphs */\n");
250  printf ("#define PIKTO_START\t0x%06X\n", PIKTO_START);
251  printf ("#define PIKTO_END\t0x%06X\n", PIKTO_END);
252  printf ("#define PIKTO_SIZE\t(PIKTO_END - PIKTO_START + 1)\n");
253  printf ("\n\n");
254  printf ("/* wcwidth -- return charcell positions of one code point */\n");
255  printf ("inline int\nwcwidth (wchar_t wc)\n{\n");
256  printf (" return (wcswidth (&wc, 1));\n");
257  printf ("}\n");
258  printf ("\n\n");
259  printf ("int\nwcswidth (const wchar_t *pwcs, size_t n)\n{\n\n");
260  printf (" int i; /* loop variable */\n");
261  printf (" unsigned codept; /* Unicode code point of current character */\n");
262  printf (" unsigned plane; /* Unicode plane, 0x00..0x10 */\n");
263  printf (" unsigned lower17; /* lower 17 bits of Unicode code point */\n");
264  printf (" unsigned lower16; /* lower 16 bits of Unicode code point */\n");
265  printf (" int lowpt, midpt, highpt; /* for binary searching in plane1zeroes[] */\n");
266  printf (" int found; /* for binary searching in plane1zeroes[] */\n");
267  printf (" int totalwidth; /* total width of string, in charcells (1 or 2/glyph) */\n");
268  printf (" int illegalchar; /* Whether or not this code point is illegal */\n");
269  putchar ('\n');
270 
271  /*
272  Print the glyph_width[] array for glyphs widths in the
273  Basic Multilingual Plane (Plane 0).
274  */
275  printf (" char glyph_width[0x20000] = {");
276  for (i = 0; i < 0x10000; i++) {
277  if ((i & 0x1F) == 0)
278  printf ("\n /* U+%04X */ ", i);
279  printf ("%d,", glyph_width[i]);
280  }
281  for (i = 0x10000; i < 0x20000; i++) {
282  if ((i & 0x1F) == 0)
283  printf ("\n /* U+%06X */ ", i);
284  printf ("%d", glyph_width[i]);
285  if (i < 0x1FFFF) putchar (',');
286  }
287  printf ("\n };\n\n");
288 
289  /*
290  Print the pikto_width[] array for Pikto glyph widths.
291  */
292  printf (" char pikto_width[PIKTO_SIZE] = {");
293  for (i = 0; i < PIKTO_SIZE; i++) {
294  if ((i & 0x1F) == 0)
295  printf ("\n /* U+%06X */ ", PIKTO_START + i);
296  printf ("%d", pikto_width[i]);
297  if ((PIKTO_START + i) < PIKTO_END) putchar (',');
298  }
299  printf ("\n };\n\n");
300 
301  /*
302  Execution part of wcswidth.
303  */
304  printf ("\n");
305  printf (" illegalchar = totalwidth = 0;\n");
306  printf (" for (i = 0; !illegalchar && i < n; i++) {\n");
307  printf (" codept = pwcs[i];\n");
308  printf (" plane = codept >> 16;\n");
309  printf (" lower17 = codept & 0x1FFFF;\n");
310  printf (" lower16 = codept & 0xFFFF;\n");
311  printf (" if (plane < 2) { /* the most common case */\n");
312  printf (" if (glyph_width[lower17] < 0) illegalchar = 1;\n");
313  printf (" else totalwidth += glyph_width[lower17];\n");
314  printf (" }\n");
315  printf (" else { /* a higher plane or beyond Unicode range */\n");
316  printf (" if ((lower16 == 0xFFFE) || (lower16 == 0xFFFF)) {\n");
317  printf (" illegalchar = 1;\n");
318  printf (" }\n");
319  printf (" else if (plane < 4) { /* Ideographic Plane */\n");
320  printf (" totalwidth += 2; /* Default ideographic width */\n");
321  printf (" }\n");
322  printf (" else if (plane == 0x0F) { /* CSUR Private Use Area */\n");
323  printf (" if (lower16 <= 0x0E6F) { /* Kinya */\n");
324  printf (" totalwidth++; /* all Kinya syllables have width 1 */\n");
325  printf (" }\n");
326  printf (" else if (lower16 <= (PIKTO_END & 0xFFFF)) { /* Pikto */\n");
327  printf (" if (pikto_width[lower16 - (PIKTO_START & 0xFFFF)] < 0) illegalchar = 1;\n");
328  printf (" else totalwidth += pikto_width[lower16 - (PIKTO_START & 0xFFFF)];\n");
329  printf (" }\n");
330  printf (" }\n");
331  printf (" else if (plane > 0x10) {\n");
332  printf (" illegalchar = 1;\n");
333  printf (" }\n");
334  printf (" /* Other non-printing in higher planes; return -1 as per IEEE 1003.1-2008. */\n");
335  printf (" else if (/* language tags */\n");
336  printf (" codept == 0x0E0001 || (codept >= 0x0E0020 && codept <= 0x0E007F) ||\n");
337  printf (" /* variation selectors, 0x0E0100..0x0E01EF */\n");
338  printf (" (codept >= 0x0E0100 && codept <= 0x0E01EF)) {\n");
339  printf (" illegalchar = 1;\n");
340  printf (" }\n");
341  printf (" /*\n");
342  printf (" Unicode plane 0x02..0x10 printing character\n");
343  printf (" */\n");
344  printf (" else {\n");
345  printf (" illegalchar = 1; /* code is not in font */\n");
346  printf (" }\n");
347  printf ("\n");
348  printf (" }\n");
349  printf (" }\n");
350  printf (" if (illegalchar) totalwidth = -1;\n");
351  printf ("\n");
352  printf (" return (totalwidth);\n");
353  printf ("\n");
354  printf ("}\n");
355 
356  exit (EXIT_SUCCESS);
357 }
PIKTO_SIZE
#define PIKTO_SIZE
Definition: unigenwidth.c:52
PIKTO_END
#define PIKTO_END
End of Pikto code point range.
Definition: unigenwidth.c:50
PIKTO_START
#define PIKTO_START
Start of Pikto code point range.
Definition: unigenwidth.c:49
MAXSTRING
#define MAXSTRING
Maximum input line length - 1.
Definition: unigenwidth.c:46
main
int main(int argc, char **argv)
The main function.
Definition: unigenwidth.c:63