File: /usr/src/linux/fs/udf/unicode.c

1     /*
2      * unicode.c
3      *
4      * PURPOSE
5      *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6      *      Also handles filename mangling
7      *
8      * DESCRIPTION
9      *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10      *		http://www.osta.org/
11      *	UTF-8 is explained in IETF RFC 3629 (which obsoletes RFC 2279).
12      *		https://www.rfc-editor.org/rfc/rfc3629
13      *
14      * CONTACTS
15      *	E-mail regarding any portion of the Linux UDF file system should be
16      *	directed to the development team's mailing list (run by majordomo):
17      *		linux_udf@hpesjro.fc.hp.com
18      *
19      * COPYRIGHT
20      *	This file is distributed under the terms of the GNU General Public
21      *	License (GPL). Copies of the GPL can be obtained from:
22      *		ftp://prep.ai.mit.edu/pub/gnu/GPL
23      *	Each contributing author retains all rights to their own work.
24      */
25     
26     
27     #ifdef __KERNEL__
28     #include <linux/kernel.h>
29     #include <linux/string.h>	/* for memset */
30     #include <linux/nls.h>
31     #include <linux/udf_fs.h>
32     #include "udf_sb.h"
33     #else
34     #include <string.h>
35     #endif
36     
37     #include "udfdecl.h"
38     
39     int udf_ustr_to_dchars(Uint8 *dest, const struct ustr *src, int strlen)
40     {
41     	if ( (!dest) || (!src) || (!strlen) || (src->u_len > strlen) )
42     		return 0;
43     	memcpy(dest+1, src->u_name, src->u_len);
44     	dest[0] = src->u_cmpID;
45     	return src->u_len + 1;
46     }
47     
48     int udf_ustr_to_char(Uint8 *dest, const struct ustr *src, int strlen)
49     {
50     	if ( (!dest) || (!src) || (!strlen) || (src->u_len >= strlen) )
51     		return 0;
52     	memcpy(dest, src->u_name, src->u_len);
53     	return src->u_len;
54     }
55     
56     int udf_ustr_to_dstring(dstring *dest, const struct ustr *src, int dlength)
57     {
58     	if ( udf_ustr_to_dchars(dest, src, dlength-1) )
59     	{
60     		dest[dlength-1] = src->u_len + 1;
61     		return dlength;
62     	}
63     	else
64     		return 0;
65     }
66     
67     int udf_dchars_to_ustr(struct ustr *dest, const Uint8 *src, int strlen)
68     {
69     	if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN) )
70     		return 0;
71     	memset(dest, 0, sizeof(struct ustr));
72     	memcpy(dest->u_name, src+1, strlen-1);
73     	dest->u_cmpID = src[0];
74     	dest->u_len = strlen-1;
75     	return strlen-1;
76     }
77     
78     int udf_char_to_ustr(struct ustr *dest, const Uint8 *src, int strlen)
79     {
80     	if ( (!dest) || (!src) || (!strlen) || (strlen >= UDF_NAME_LEN) )
81     		return 0;
82     	memset(dest, 0, sizeof(struct ustr));
83     	memcpy(dest->u_name, src, strlen);
84     	dest->u_cmpID = 0x08;
85     	dest->u_len = strlen;
86     	return strlen;
87     }
88     
89     
90     int udf_dstring_to_ustr(struct ustr *dest, const dstring *src, int dlength)
91     {
92     	if ( dlength && udf_dchars_to_ustr(dest, src, src[dlength-1]) )
93     		return dlength;
94     	else
95     		return 0;
96     }
97     
98     /*
99      * udf_build_ustr
100      */
101     int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
102     {
103     	int usesize;
104     
105     	if ( (!dest) || (!ptr) || (!size) )
106     		return -1;
107     
108     	memset(dest, 0, sizeof(struct ustr));
109     	usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
110     	dest->u_cmpID=ptr[0];
111     	dest->u_len=ptr[size-1];
112     	memcpy(dest->u_name, ptr+1, usesize-1);
113     	return 0;
114     }
115     
116     /*
117      * udf_build_ustr_exact
118      */
119     int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
120     {
121     	if ( (!dest) || (!ptr) || (!exactsize) )
122     		return -1;
123     
124     	memset(dest, 0, sizeof(struct ustr));
125     	dest->u_cmpID=ptr[0];
126     	dest->u_len=exactsize-1;
127     	memcpy(dest->u_name, ptr+1, exactsize-1);
128     	return 0;
129     }
130     
131     /*
132      * udf_ocu_to_utf8
133      *
134      * PURPOSE
135      *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
136      *
137      * DESCRIPTION
138      *	This routine is only called by udf_filldir().
139      *
140      * PRE-CONDITIONS
141      *	utf			Pointer to UTF-8 output buffer.
142      *	ocu			Pointer to OSTA Compressed Unicode input buffer
143      *				of size UDF_NAME_LEN bytes.
144      * 				both of type "struct ustr *"
145      *
146      * POST-CONDITIONS
147      *	<return>		Zero on success.
148      *
149      * HISTORY
150      *	November 12, 1997 - Andrew E. Mileski
151      *	Written, tested, and released.
152      */
153     int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
154     {
155     	Uint8 *ocu;
156     	Uint32 c;
157     	Uint8 cmp_id, ocu_len;
158     	int i;
159     
160     	ocu = ocu_i->u_name;
161     
162     	ocu_len = ocu_i->u_len;
163     	cmp_id = ocu_i->u_cmpID;
164     	utf_o->u_len = 0;
165     
166     	if (ocu_len == 0)
167     	{
168     		memset(utf_o, 0, sizeof(struct ustr));
169     		utf_o->u_cmpID = 0;
170     		utf_o->u_len = 0;
171     		return 0;
172     	}
173     
174     	if ((cmp_id != 8) && (cmp_id != 16))
175     	{
176     #ifdef __KERNEL__
177     		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
178     #endif
179     		return 0;
180     	}
181     
182     	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
183     	{
184     
185     		/* Expand OSTA compressed Unicode to Unicode */
186     		c = ocu[i++];
187     		if (cmp_id == 16)
188     			c = (c << 8) | ocu[i++];
189     
190     		/* Compress Unicode to UTF-8 */
191     		if (c < 0x80U)
192     			utf_o->u_name[utf_o->u_len++] = (Uint8)c;
193     		else if (c < 0x800U)
194     		{
195     			utf_o->u_name[utf_o->u_len++] = (Uint8)(0xc0 | (c >> 6));
196     			utf_o->u_name[utf_o->u_len++] = (Uint8)(0x80 | (c & 0x3f));
197     		}
198     		else
199     		{
200     			utf_o->u_name[utf_o->u_len++] = (Uint8)(0xe0 | (c >> 12));
201     			utf_o->u_name[utf_o->u_len++] = (Uint8)(0x80 | ((c >> 6) & 0x3f));
202     			utf_o->u_name[utf_o->u_len++] = (Uint8)(0x80 | (c & 0x3f));
203     		}
204     	}
205     	utf_o->u_cmpID=8;
206     	utf_o->u_hash=0L;
207     	utf_o->padding=0;
208     
209     	return utf_o->u_len;
210     }
211     
212     /*
213      *
214      * udf_utf8_to_ocu
215      *
216      * PURPOSE
217      *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
218      *
219      * DESCRIPTION
220      *	This routine is only called by udf_lookup().
221      *
222      * PRE-CONDITIONS
223      *	ocu			Pointer to OSTA Compressed Unicode output
224      *				buffer of size UDF_NAME_LEN bytes.
225      *	utf			Pointer to UTF-8 input buffer.
226      *	utf_len			Length of UTF-8 input buffer in bytes.
227      *
228      * POST-CONDITIONS
229      *	<return>		Zero on success.
230      *
231      * HISTORY
232      *	November 12, 1997 - Andrew E. Mileski
233      *	Written, tested, and released.
234      */
235     int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
236     {
237     	unsigned c, i, max_val, utf_char;
238     	int utf_cnt;
239     	int u_len = 0;
240     
241     	memset(ocu, 0, sizeof(dstring) * length);
242     	ocu[0] = 8;
243     	max_val = 0xffU;
244     
245     try_again:
246     	utf_char = 0U;
247     	utf_cnt = 0U;
248     	for (i = 0U; i < utf->u_len; i++)
249     	{
250     		c = (Uint8)utf->u_name[i];
251     
252     		/* Complete a multi-byte UTF-8 character */
253     		if (utf_cnt)
254     		{
255     			utf_char = (utf_char << 6) | (c & 0x3fU);
256     			if (--utf_cnt)
257     				continue;
258     		}
259     		else
260     		{
261     			/* Check for a multi-byte UTF-8 character */
262     			if (c & 0x80U)
263     			{
264     				/* Start a multi-byte UTF-8 character */
265     				if ((c & 0xe0U) == 0xc0U)
266     				{
267     					utf_char = c & 0x1fU;
268     					utf_cnt = 1;
269     				}
270     				else if ((c & 0xf0U) == 0xe0U)
271     				{
272     					utf_char = c & 0x0fU;
273     					utf_cnt = 2;
274     				}
275     				else if ((c & 0xf8U) == 0xf0U)
276     				{
277     					utf_char = c & 0x07U;
278     					utf_cnt = 3;
279     				}
280     				else if ((c & 0xfcU) == 0xf8U)
281     				{
282     					utf_char = c & 0x03U;
283     					utf_cnt = 4;
284     				}
285     				else if ((c & 0xfeU) == 0xfcU)
286     				{
287     					utf_char = c & 0x01U;
288     					utf_cnt = 5;
289     				}
290     				else
291     					goto error_out;
292     				continue;
293     			} else
294     				/* Single byte UTF-8 character (most common) */
295     				utf_char = c;
296     		}
297     
298     		/* Choose no compression if necessary */
299     		if (utf_char > max_val)
300     		{
301     			if ( 0xffU == max_val )
302     			{
303     				max_val = 0xffffU;
304     				ocu[0] = (Uint8)0x10U;
305     				goto try_again;
306     			}
307     			goto error_out;
308     		}
309     
310     		if (max_val == 0xffffU)
311     		{
312     			ocu[++u_len] = (Uint8)(utf_char >> 8);
313     		}
314     		ocu[++u_len] = (Uint8)(utf_char & 0xffU);
315     	}
316     
317     
318     	if (utf_cnt)
319     	{
320     error_out:
321     #ifdef __KERNEL__
322     		printk(KERN_ERR "udf: bad UTF-8 character\n");
323     #endif
324     		return 0;
325     	}
326     
327     	ocu[length - 1] = (Uint8)u_len + 1;
328     	return u_len + 1;
329     }
330     
331     #ifdef __KERNEL__
332     int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i)
333     {
334     	Uint8 *ocu;
335     	Uint32 c;
336     	Uint8 cmp_id, ocu_len;
337     	int i;
338     
339     	ocu = ocu_i->u_name;
340     
341     	ocu_len = ocu_i->u_len;
342     	cmp_id = ocu_i->u_cmpID;
343     	utf_o->u_len = 0;
344     
345     	if (ocu_len == 0)
346     	{
347     		memset(utf_o, 0, sizeof(struct ustr));
348     		utf_o->u_cmpID = 0;
349     		utf_o->u_len = 0;
350     		return 0;
351     	}
352     
353     	if ((cmp_id != 8) && (cmp_id != 16))
354     	{
355     		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
356     		return 0;
357     	}
358     
359     	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
360     	{
361     		/* Expand OSTA compressed Unicode to Unicode */
362     		c = ocu[i++];
363     		if (cmp_id == 16)
364     			c = (c << 8) | ocu[i++];
365     
366     		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 
367     			UDF_NAME_LEN - utf_o->u_len);
368     	}
369     	utf_o->u_cmpID=8;
370     	utf_o->u_hash=0L;
371     	utf_o->padding=0;
372     
373     	return utf_o->u_len;
374     }
375     
376     int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length)
377     {
378     	unsigned len, i, max_val;
379     	Uint16 uni_char;
380     	int uni_cnt;
381     	int u_len = 0;
382     
383     	memset(ocu, 0, sizeof(dstring) * length);
384     	ocu[0] = 8;
385     	max_val = 0xffU;
386     
387     try_again:
388     	uni_char = 0U;
389     	uni_cnt = 0U;
390     	for (i = 0U; i < uni->u_len; i++)
391     	{
392     		len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char);
393     
394     		if (len == 2 && max_val == 0xff)
395     		{
396     			max_val = 0xffffU;
397     			ocu[0] = (Uint8)0x10U;
398     			goto try_again;
399     		}
400     		
401     		if (max_val == 0xffffU)
402     		{
403     			ocu[++u_len] = (Uint8)(uni_char >> 8);
404     			i++;
405     		}
406     		ocu[++u_len] = (Uint8)(uni_char & 0xffU);
407     	}
408     
409     	ocu[length - 1] = (Uint8)u_len + 1;
410     	return u_len + 1;
411     }
412     
413     int udf_get_filename(struct super_block *sb, Uint8 *sname, Uint8 *dname, int flen)
414     {
415     	struct ustr filename, unifilename;
416     	int len;
417     
418     	if (udf_build_ustr_exact(&unifilename, sname, flen))
419     	{
420     		return 0;
421     	}
422     
423     	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
424     	{
425     		if (!udf_CS0toUTF8(&filename, &unifilename) )
426     		{
427     			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
428     			return 0;
429     		}
430     	}
431     	else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
432     	{
433     		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) )
434     		{
435     			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
436     			return 0;
437     		}
438     	}
439     	else
440     		return 0;
441     
442     	if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
443     		unifilename.u_name, unifilename.u_len)))
444     	{
445     		return len;
446     	}
447     	return 0;
448     }
449     #endif
450     
451     #define ILLEGAL_CHAR_MARK	'_'
452     #define EXT_MARK			'.'
453     #define CRC_MARK			'#'
454     #define EXT_SIZE			5
455     
456     int udf_translate_to_linux(Uint8 *newName, Uint8 *udfName, int udfLen, Uint8 *fidName, int fidNameLen)
457     {
458     	int index, newIndex = 0, needsCRC = 0;	
459     	int extIndex = 0, newExtIndex = 0, hasExt = 0;
460     	unsigned short valueCRC;
461     	Uint8 curr;
462     	const Uint8 hexChar[] = "0123456789ABCDEF";
463     
464     	if (udfName[0] == '.' && (udfLen == 1 ||
465     		(udfLen == 2 && udfName[1] == '.')))
466     	{
467     		needsCRC = 1;
468     		newIndex = udfLen;
469     		memcpy(newName, udfName, udfLen);
470     	}
471     	else
472     	{	
473     		for (index = 0; index < udfLen; index++)
474     		{
475     			curr = udfName[index];
476     			if (curr == '/' || curr == 0)
477     			{
478     				needsCRC = 1;
479     				curr = ILLEGAL_CHAR_MARK;
480     				while (index+1 < udfLen && (udfName[index+1] == '/' ||
481     					udfName[index+1] == 0))
482     					index++;
483     			}
484     			if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE)
485     			{
486     				if (udfLen == index + 1)
487     					hasExt = 0;
488     				else
489     				{
490     					hasExt = 1;
491     					extIndex = index;
492     					newExtIndex = newIndex;
493     				}
494     			}
495     			if (newIndex < 256)
496     				newName[newIndex++] = curr;
497     			else
498     				needsCRC = 1;
499     		}
500     	}
501     	if (needsCRC)
502     	{
503     		Uint8 ext[EXT_SIZE];
504     		int localExtIndex = 0;
505     
506     		if (hasExt)
507     		{
508     			int maxFilenameLen;
509     			for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen;
510     				index++ )
511     			{
512     				curr = udfName[extIndex + index + 1];
513     
514     				if (curr == '/' || curr == 0)
515     				{
516     					needsCRC = 1;
517     					curr = ILLEGAL_CHAR_MARK;
518     					while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE
519     						&& (udfName[extIndex + index + 2] == '/' ||
520     							udfName[extIndex + index + 2] == 0)))
521     						index++;
522     				}
523     				ext[localExtIndex++] = curr;
524     			}
525     			maxFilenameLen = 250 - localExtIndex;
526     			if (newIndex > maxFilenameLen)
527     				newIndex = maxFilenameLen;
528     			else
529     				newIndex = newExtIndex;
530     		}
531     		else if (newIndex > 250)
532     			newIndex = 250;
533     		newName[newIndex++] = CRC_MARK;
534     		valueCRC = udf_crc(fidName, fidNameLen, 0);
535     		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
536     		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
537     		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
538     		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
539     
540     		if (hasExt)
541     		{
542     			newName[newIndex++] = EXT_MARK;
543     			for (index = 0;index < localExtIndex ;index++ )
544     				newName[newIndex++] = ext[index];
545     		}
546     	}
547     	return newIndex;
548     }
549