How to detect string encoding?
13/12/03 10:10
Is it possible to auto-detect the encoding of a string in a Cocoa program?
There is no specific routine to do that in Cocoa (Carbon has one; see the end of the example code). As a matter of good practice, the encoding of a text should be determined by a higher-level protocol outside of the text itself (such as MIME types and others…). Scanning the text to determine its encoding may prove difficult and unreliable in the general case.
However, a lot of standard text formats (like HTML, XML, …) include the definition of the encoding used in their document structure. If you know what kind of text you have, you can parse the beginning of it to determine its encoding.
If you are writing a text editor and want to provide an auto-detecting file-reading routine, you may try to read the file first in the most "high level" encoding and fall back to simpler ones in case of failure: try UTF-32 first; if it fails, fall back to UTF-16, then UTF-8, then ASCII, ...
A sequence of
// Try each candidate encoding in turn and stop at the first one that
// produces a string from theData.
NSString *myString = nil ;
NSStringEncoding myEncodingToTest[] = { ...put here the encoding you want to test...};
int i, howManyEncodings = sizeof(myEncodingToTest) / sizeof(myEncodingToTest[0]) ;
for (i = 0; (i < howManyEncodings) && (myString == nil); i++) {
    NS_DURING
        // Use the i-th candidate; myString stays nil when the decode fails,
        // which lets the loop move on to the next encoding.
        myString = [[NSString alloc] initWithData:theData encoding:myEncodingToTest[i]] ;
    NS_HANDLER
        // An exception counts as "this encoding does not fit": keep trying.
    NS_ENDHANDLER
}
if (myString == nil) {
    // failed to parse the string in a "valid" encoding
    return NO ;
}
// success (the caller owns myString, which was created with alloc/init)
return YES ;
will be a simplistic way of solving the problem. Once at the ASCII level, you may still need help from the user because there is nothing to tell you whether the ASCII file was created with a Windows, ISO or Mac encoding, etc. The line endings may give some hint, but even this is not 100% reliable...
A more customizable version of the above principle could be the following one (you may improve it to fit your needs; in particular, decide which characters below ' ' are acceptable and whether 0x7F, often used as the DEL character, is valid in a string).
/*
 * Base class of the per-encoding detectors. An instance walks over the bytes
 * of an NSData object and reports whether they all pass the validity test of
 * one particular NSStringEncoding.
 */
@interface NSStringEncodingDetector : NSObject
{
@protected
    NSStringEncoding     _encoding;    // encoding this detector checks for
    NSData              *_data;        // retained source data
    const unsigned char *_buffer;      // first byte of _data
    const unsigned char *_position;    // next byte to examine
    const unsigned char *_eofPos;      // one past the last byte of _data
    unsigned             _bufLength;   // length of _data in bytes
}

/* Array of NSNumber-wrapped encodings whose detector accepts all of inData. */
+ (NSArray *)validEncodingsForData:(NSData *)inData;

/* Autoreleased detector for inEncoding, or nil if the encoding is unsupported. */
+ (id)newWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding;

- (id)initWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding;

/* Consumes one valid sequence; *outEOF is set to YES once the data is exhausted. */
- (BOOL)advance:(BOOL *)outEOF;

/* YES when the whole buffer could be consumed. */
- (BOOL)acceptData;

/* Offset of the current position from the start of the buffer. */
- (unsigned)curPosition;

- (NSStringEncoding)encoding;
@end
#import "NSStringEncodingDetector.h"
/*
 * Methods every concrete detector subclass implements.
 */
@protocol NSStringEncodingDetectorTest

/* Creates a detector for the subclass' own encoding. */
+ (id)newWithData:(NSData *)inData;

/*
 * Examines the sequence starting at inBuffer (inMaxLength bytes remain).
 * Returns YES and stores the sequence length in *outValidSeqtoSkip when the
 * sequence is valid for the encoding; returns NO otherwise.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer
              maxLength:(unsigned)inMaxLength
                 toSkip:(int *)outValidSeqtoSkip;

@end
/* Detector for NSASCIIStringEncoding; adds no instance variables. */
@interface NSASCIIStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSNEXTSTEPStringEncoding; adds no instance variables. */
@interface NSNEXTSTEPStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSJapaneseEUCStringEncoding; adds no instance variables. */
@interface NSJapaneseEUCStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSUTF8StringEncoding; adds no instance variables. */
@interface NSUTF8StringEncodingEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSISOLatin1StringEncoding; adds no instance variables. */
@interface NSISOLatin1StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSSymbolStringEncoding; adds no instance variables. */
@interface NSSymbolStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSNonLossyASCIIStringEncoding; adds no instance variables. */
@interface NSNonLossyASCIIStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSShiftJISStringEncoding; adds no instance variables. */
@interface NSShiftJISStringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSISOLatin2StringEncoding; adds no instance variables. */
@interface NSISOLatin2StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSUnicodeStringEncoding; keeps byte-order state between calls. */
@interface NSUnicodeStringEncodingDetector : NSStringEncodingDetector
{
@protected
    NSCharacterSet *_illCharSet;   // characters that make a sequence invalid
    BOOL            _swapBytes;    // byte-order flag, set while testing the BOM
    BOOL            _swapTested;   // YES once the start of the data was examined
    BOOL            _isUTF32;      // YES when a UTF-32 byte-order mark was seen
}
@end
/* Detector for NSWindowsCP1251StringEncoding; adds no instance variables. */
@interface NSWindowsCP1251StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSWindowsCP1252StringEncoding; adds no instance variables. */
@interface NSWindowsCP1252StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSWindowsCP1253StringEncoding; adds no instance variables. */
@interface NSWindowsCP1253StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSWindowsCP1254StringEncoding; adds no instance variables. */
@interface NSWindowsCP1254StringEncodingDetector : NSStringEncodingDetector
@end
/* Detector for NSWindowsCP1250StringEncoding; adds no instance variables. */
@interface NSWindowsCP1250StringEncodingDetector : NSStringEncodingDetector
@end
/* Character set currently selected by an ISO-2022-JP escape sequence. */
enum {
    EISO2022_esc_ASCII    = 0,
    EISO2022_esc_JISRoman = 1,
    EISO2022_esc_JISC     = 2,
    EISO2022_esc_JISX     = 3
};
/* Detector for NSISO2022JPStringEncoding; remembers the active escape state. */
@interface NSISO2022JPStringEncodingDetector : NSStringEncodingDetector
{
@protected
    int _currentEscEncoding;   // one of the EISO2022_esc_* values
}
@end
/* Detector for NSMacOSRomanStringEncoding; adds no instance variables. */
@interface NSMacOSRomanStringEncodingDetector : NSStringEncodingDetector
@end
@implementation NSASCIIStringEncodingDetector

/* Returns an autoreleased detector that tests inData against 7-bit ASCII. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSASCIIStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Returns NO for control characters other than \n, \r, \t and \f, and for
 * any byte above 0x7F; otherwise stores 1 in *outValidSeqtoSkip and returns YES.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    // NSASCIIStringEncoding is strict 7-bit ASCII: 8-bit bytes cannot belong to it.
    if (c > 0x7F)
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSNEXTSTEPStringEncodingDetector

/* Returns an autoreleased detector that tests inData against the NEXTSTEP encoding. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSNEXTSTEPStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer (8-bit, single-byte encoding).
 * Only control characters other than \n, \r, \t and \f are rejected.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSJapaneseEUCStringEncodingDetector

/* Returns an autoreleased detector that tests inData against EUC-JP. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSJapaneseEUCStringEncoding] autorelease] ;
}

/*
 * Validates one EUC-JP sequence at inBuffer:
 *   0x00-0x7F                        one byte (control chars limited to \n \r \t \f)
 *   0x8E, 0xA1-0xDF                  two bytes (half-width katakana, SS2)
 *   0x8F, 0xA1-0xFE, 0xA1-0xFE       three bytes (JIS X 0212, SS3)
 *   0xA1-0xFE, 0xA1-0xFE             two bytes (JIS X 0208)
 * Stores the sequence length in *outValidSeqtoSkip and returns YES, or returns NO.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;

    if (lead <= 0x7F) {
        if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
            return NO ;
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }
    if (lead == 0x8E) {
        if ((inMaxLength < 2) || (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xDF))
            return NO ;
        *outValidSeqtoSkip = 2 ;
        return YES ;
    }
    if (lead == 0x8F) {
        if ((inMaxLength < 3) ||
            (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xFE) ||
            (inBuffer[2] < 0xA1) || (inBuffer[2] > 0xFE))
            return NO ;
        *outValidSeqtoSkip = 3 ;
        return YES ;
    }
    if ((lead < 0xA1) || (lead > 0xFE))
        return NO ;
    // Two-byte character: the trail byte must be in the same range as the lead.
    if ((inMaxLength < 2) || (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xFE))
        return NO ;
    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of bytes that follow that first byte. Entries for 0x00-0xBF are
 * 0, 0xC0-0xDF are 1, 0xE0-0xEF are 2, 0xF0-0xF7 are 3, 0xF8-0xFB are 4 and
 * 0xFC-0xFF are 5.
 */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
@implementation NSUTF8StringEncodingEncodingDetector

/* Returns an autoreleased detector that tests inData against UTF-8. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSUTF8StringEncoding] autorelease] ;
}

/*
 * Validates one UTF-8 sequence at inBuffer.
 * Returns NO for disallowed control characters, for a lead byte that cannot
 * start a sequence, for a sequence that runs past inMaxLength, and for any
 * trail byte that is not a continuation byte. Otherwise stores the sequence
 * length in *outValidSeqtoSkip and returns YES.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;
    unsigned length, i ;

    // Control characters other than \n \r \t \f are rejected.
    if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
        return NO ;
    // 0x80-0xBF are continuation bytes and cannot start a sequence; 0xC0 and
    // 0xC1 only produce overlong forms; 0xF5-0xFF are excluded by RFC 3629.
    if (((lead >= 0x80) && (lead <= 0xC1)) || (lead >= 0xF5))
        return NO ;

    length = (unsigned)trailingBytesForUTF8[lead] + 1 ;
    if (length > inMaxLength)
        return NO ;
    // Every trail byte must have the form 10xxxxxx.
    for (i = 1; i < length; i++) {
        if ((inBuffer[i] & 0xC0) != 0x80)
            return NO ;
    }
    *outValidSeqtoSkip = (int)length ;
    return YES ;
}
@end
@implementation NSISOLatin1StringEncodingDetector

/* Returns an autoreleased detector that tests inData against ISO Latin 1. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISOLatin1StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, DEL (0x7F) and the
 * C1 control range 0x80-0x9F. 0xA0 (no-break space) is a printable Latin-1
 * character and is accepted.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c >= 0x7F) && (c < 0xA0))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSSymbolStringEncodingDetector

/* Returns an autoreleased detector that tests inData against the Symbol encoding. */
+ (id)newWithData:(NSData *)inData
{
    id detector = [[self alloc] initWithData:inData forEncoding:NSSymbolStringEncoding] ;
    return [detector autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer. Every byte below 0x20 and the byte
 * 0xFF are rejected; for any other byte 1 is stored in *outValidSeqtoSkip.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = inBuffer[0] ;
    BOOL acceptable = (byte >= 0x20) && (byte != 0xFF) ;

    if (acceptable)
        *outValidSeqtoSkip = 1 ;
    return acceptable ;
}
@end
@implementation NSNonLossyASCIIStringEncodingDetector

/* Returns an autoreleased detector that tests inData against non-lossy ASCII. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSNonLossyASCIIStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and any byte above
 * 0x7F: non-lossy ASCII represents everything with 7-bit characters.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if (c > 0x7F)
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSShiftJISStringEncodingDetector

/* Returns an autoreleased detector that tests inData against Shift-JIS. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSShiftJISStringEncoding] autorelease] ;
}

/*
 * Validates one Shift-JIS sequence at inBuffer:
 *   0x00-0x7F   one byte (control chars limited to \n \r \t \f)
 *   0xA1-0xDF   one byte (half-width katakana)
 *   0x81-0x9F or 0xE0-0xEF followed by 0x40-0x7E / 0x80-0xFC   two bytes
 * Stores the sequence length in *outValidSeqtoSkip and returns YES, or returns NO.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;
    unsigned char trail ;

    // Single-byte characters. Without this branch any text containing a
    // space, a digit or a newline would be refused.
    if (lead < 0x80) {
        if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
            return NO ;
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }
    if ((lead >= 0xA1) && (lead <= 0xDF)) {
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }

    // Double-byte characters.
    if (! ( ((lead >= 0x81) && (lead <= 0x9F)) || ((lead >= 0xE0) && (lead <= 0xEF)) ) )
        return NO ;
    if (inMaxLength < 2)
        return NO ;
    trail = inBuffer[1] ;
    if ((trail < 0x40) || (trail > 0xFC) || (trail == 0x7F))
        return NO ;
    // Last lead byte: only the first half of the trail range is used.
    if ((lead == 0xEF) && (trail > 0x9E))
        return NO ;
    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
@implementation NSISOLatin2StringEncodingDetector

/* Returns an autoreleased detector that tests inData against ISO Latin 2. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISOLatin2StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and the C1 control
 * range 0x80-0x9F.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c > 0x7F) && (c < 0xA0))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSUnicodeStringEncodingDetector

/* Returns an autoreleased detector that tests inData against UTF-16/UTF-32. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSUnicodeStringEncoding] autorelease] ;
}

- (id)init
{
    if ((self = [super init])) {
        _illCharSet = [[NSCharacterSet illegalCharacterSet] retain] ;
        _swapBytes = NO ;
        _swapTested = NO ;
        _isUTF32 = NO ;
    }
    return self ;
}

/* Releases the character set retained in -init. */
- (void)dealloc
{
    [_illCharSet release] ;
    [super dealloc] ;
}

/*
 * Validates one code unit at inBuffer.
 *
 * On the first call the start of the data is checked for a byte-order mark:
 *   FF FE 00 00   UTF-32, little-endian
 *   00 00 FE FF   UTF-32, big-endian
 *   FE FF         UTF-16, big-endian
 *   FF FE         UTF-16, little-endian
 * A recognised mark is consumed and fixes _isUTF32 / _swapBytes (YES means
 * little-endian data). Data that starts with 00 00 but carries no UTF-32 mark
 * is rejected. Without any mark the data is read as big-endian UTF-16.
 *
 * Units are assembled byte by byte, so the result does not depend on the
 * byte order of the host or on the alignment of inBuffer. A unit that belongs
 * to the illegal character set makes the method return NO; otherwise the unit
 * size is stored in *outValidSeqtoSkip and YES is returned.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    if (inMaxLength < sizeof(unichar))
        return NO ;

    if (!_swapTested) {
        _swapTested = YES ;
        // The UTF-32 marks are tested first: the little-endian one begins
        // with the same two bytes as the UTF-16 little-endian mark.
        if (inMaxLength >= sizeof(UTF32Char)) {
            if ((inBuffer[0] == 0xFF) && (inBuffer[1] == 0xFE) && (inBuffer[2] == 0) && (inBuffer[3] == 0)) {
                _swapBytes = YES ;
                _isUTF32 = YES ;
                *outValidSeqtoSkip = sizeof(UTF32Char) ;
                return YES ;
            }
            if ((inBuffer[0] == 0) && (inBuffer[1] == 0) && (inBuffer[2] == 0xFE) && (inBuffer[3] == 0xFF)) {
                _swapBytes = NO ;
                _isUTF32 = YES ;
                *outValidSeqtoSkip = sizeof(UTF32Char) ;
                return YES ;
            }
        }
        if ((inBuffer[0] == 0xFE) && (inBuffer[1] == 0xFF)) {
            _swapBytes = NO ;
            *outValidSeqtoSkip = sizeof(unichar) ;
            return YES ;
        }
        if ((inBuffer[0] == 0xFF) && (inBuffer[1] == 0xFE)) {
            _swapBytes = YES ;
            *outValidSeqtoSkip = sizeof(unichar) ;
            return YES ;
        }
        // A leading 00 00 that is not part of a UTF-32 mark is not accepted.
        if ((inBuffer[0] == 0) && (inBuffer[1] == 0))
            return NO ;
    }

    if (_isUTF32) {
        UTF32Char unicode32 ;

        if (inMaxLength < sizeof(UTF32Char))
            return NO ;
        if (_swapBytes)
            unicode32 = ((UTF32Char)inBuffer[3] << 24) | ((UTF32Char)inBuffer[2] << 16) |
                        ((UTF32Char)inBuffer[1] << 8)  |  (UTF32Char)inBuffer[0] ;
        else
            unicode32 = ((UTF32Char)inBuffer[0] << 24) | ((UTF32Char)inBuffer[1] << 16) |
                        ((UTF32Char)inBuffer[2] << 8)  |  (UTF32Char)inBuffer[3] ;
        if ([_illCharSet longCharacterIsMember:unicode32])
            return NO ;
        *outValidSeqtoSkip = sizeof(UTF32Char) ;
        return YES ;
    }

    {
        unichar unicode16 ;

        if (_swapBytes)
            unicode16 = (unichar)((inBuffer[1] << 8) | inBuffer[0]) ;
        else
            unicode16 = (unichar)((inBuffer[0] << 8) | inBuffer[1]) ;
        if ([_illCharSet characterIsMember:unicode16])
            return NO ;
        *outValidSeqtoSkip = sizeof(unichar) ;
        return YES ;
    }
}
@end
@implementation NSWindowsCP1251StringEncodingDetector

/* Returns an autoreleased detector that tests inData against Windows CP1251. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1251StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and 0x98, the only
 * code point the CP1251 table leaves undefined. 0xC0-0xFF are the Cyrillic
 * letters and must be accepted; refusing them would rule out ordinary
 * Russian text.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if (c == 0x98)
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSWindowsCP1252StringEncodingDetector

/* Returns an autoreleased detector that tests inData against Windows CP1252. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1252StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and the bytes
 * 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c == 0x81) ||
        (c == 0x8D) ||
        (c == 0x8F) ||
        (c == 0x90) ||
        (c == 0x9D))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSWindowsCP1253StringEncodingDetector

/* Returns an autoreleased detector that tests inData against Windows CP1253. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1253StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and the code
 * points the CP1253 table leaves undefined: 0x81, 0x88, 0x8A, 0x8C-0x90,
 * 0x98, 0x9A, 0x9C-0x9F, 0xAA, 0xD2 and 0xFF.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c == 0x81) ||
        (c == 0x88) ||
        (c == 0x8A) ||
        ((c >= 0x8C) && (c <= 0x90)) ||
        (c == 0x98) ||
        (c == 0x9A) ||
        ((c >= 0x9C) && (c <= 0x9F)) ||
        (c == 0xAA) ||
        (c == 0xD2) ||
        (c == 0xFF))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSWindowsCP1254StringEncodingDetector

/* Returns an autoreleased detector that tests inData against Windows CP1254. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1254StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and the bytes
 * 0x81, 0x8D-0x90, 0x9D and 0x9E.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c == 0x81) ||
        ((c >= 0x8D) && (c <= 0x90)) ||
        (c == 0x9D) ||
        (c == 0x9E))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSWindowsCP1250StringEncodingDetector

/* Returns an autoreleased detector that tests inData against Windows CP1250. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1250StringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Rejects control characters other than \n, \r, \t and \f, and the bytes
 * 0x81, 0x83, 0x88, 0x90 and 0x98.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    if ((c == 0x81) ||
        (c == 0x83) ||
        (c == 0x88) ||
        (c == 0x90) ||
        (c == 0x98))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSISO2022JPStringEncodingDetector

/* Returns an autoreleased detector that tests inData against ISO-2022-JP. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISO2022JPStringEncoding] autorelease] ;
}

/* Starts in the ASCII state. */
- (id)init
{
    if ((self = [super init])) {
        _currentEscEncoding = EISO2022_esc_ASCII ;
    }
    return self ;
}

/*
 * Validates one ISO-2022-JP sequence at inBuffer and tracks the character set
 * selected by escape sequences in _currentEscEncoding.
 *   \n \r \t \f              one byte, accepted in any state
 *   ESC $ @  /  ESC $ B      three bytes, allowed from a single-byte state
 *   ESC ( B  /  ESC ( J      three bytes, allowed from a double-byte state
 *   other bytes below 0x20   rejected
 *   bytes above 0x7F         rejected (the encoding is 7-bit)
 *   single-byte state        one byte
 *   double-byte state        two bytes, each in 0x21-0x7E
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    if (*inBuffer < 0x20) {
        // Any ONE of the whitespace controls is acceptable, hence "||".
        if ((*inBuffer == '\n') || (*inBuffer == '\r') || (*inBuffer == '\t') || (*inBuffer == '\f')) {
            *outValidSeqtoSkip = 1 ;
            return YES ;
        }
        if (*inBuffer == 0x1B) {
            if (inMaxLength < 3)
                return NO ;
            if (inBuffer[1] == '$') {
                // Switch to a double-byte set: only valid from a single-byte state.
                if ((_currentEscEncoding != EISO2022_esc_ASCII) && (_currentEscEncoding != EISO2022_esc_JISRoman))
                    return NO ;
                if (inBuffer[2] == '@') {
                    *outValidSeqtoSkip = 3 ;
                    _currentEscEncoding = EISO2022_esc_JISC ;
                    return YES ;
                }
                if (inBuffer[2] == 'B') {
                    *outValidSeqtoSkip = 3 ;
                    _currentEscEncoding = EISO2022_esc_JISX ;
                    return YES ;
                }
                return NO ;
            }
            if (inBuffer[1] == '(') {
                // Switch to a single-byte set: only valid from a double-byte state.
                if ((_currentEscEncoding != EISO2022_esc_JISC) && (_currentEscEncoding != EISO2022_esc_JISX))
                    return NO ;
                if (inBuffer[2] == 'B') {
                    *outValidSeqtoSkip = 3 ;
                    _currentEscEncoding = EISO2022_esc_ASCII ;
                    return YES ;
                }
                if (inBuffer[2] == 'J') {
                    *outValidSeqtoSkip = 3 ;
                    _currentEscEncoding = EISO2022_esc_JISRoman ;
                    return YES ;
                }
                return NO ;
            }
            return NO ;
        }
        // neither an escape sequence nor an accepted control character
        return NO ;
    }
    // char >= 0x20; 8-bit bytes never occur in this 7-bit encoding
    if (*inBuffer > 0x7F)
        return NO ;
    if ((_currentEscEncoding == EISO2022_esc_ASCII) || (_currentEscEncoding == EISO2022_esc_JISRoman)) {
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }
    if (inMaxLength < 2)
        return NO ;
    if ((*inBuffer < 0x21) || (*inBuffer > 0x7E) || (inBuffer[1] < 0x21) || (inBuffer[1] > 0x7E))
        return NO ;
    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
@implementation NSMacOSRomanStringEncodingDetector

/* Returns an autoreleased detector that tests inData against Mac OS Roman. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSMacOSRomanStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer (8-bit, single-byte encoding).
 * Only control characters other than \n, \r, \t and \f are rejected.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSStringEncodingDetector

/*
 * Runs the detector of every supported encoding over inData and returns the
 * encodings whose detector accepted the whole buffer, as NSNumber objects,
 * in the order of the candidate table below. Each encoding is tested once.
 */
+ (NSArray *)validEncodingsForData:(NSData *)inData
{
    static const NSStringEncoding candidates[] = {
        NSASCIIStringEncoding,
        NSNEXTSTEPStringEncoding,
        NSJapaneseEUCStringEncoding,
        NSUTF8StringEncoding,
        NSISOLatin1StringEncoding,
        NSSymbolStringEncoding,
        NSNonLossyASCIIStringEncoding,
        NSShiftJISStringEncoding,
        NSISOLatin2StringEncoding,
        NSUnicodeStringEncoding,
        NSWindowsCP1251StringEncoding,
        NSWindowsCP1252StringEncoding,
        NSWindowsCP1253StringEncoding,
        NSWindowsCP1254StringEncoding,
        NSWindowsCP1250StringEncoding,
        NSISO2022JPStringEncoding,
        NSMacOSRomanStringEncoding
    } ;
    const unsigned count = sizeof(candidates) / sizeof(candidates[0]) ;
    NSMutableArray *result = [NSMutableArray array] ;
    unsigned i ;

    for (i = 0; i < count; i++) {
        if ([[NSStringEncodingDetector newWithData:inData forEncoding:candidates[i]] acceptData])
            [result addObject:[NSNumber numberWithInt:(int)candidates[i]]] ;
    }
    return [NSArray arrayWithArray:result] ;
}

/*
 * Returns the detector subclass instance for inEncoding, or nil when no
 * detector exists for it. The subclasses' +newWithData: return autoreleased
 * objects, so the caller does not own the result.
 */
+ (id)newWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding
{
    switch (inEncoding) {
        case NSASCIIStringEncoding :
            return [NSASCIIStringEncodingDetector newWithData:inData] ;
        case NSNEXTSTEPStringEncoding :
            return [NSNEXTSTEPStringEncodingDetector newWithData:inData] ;
        case NSJapaneseEUCStringEncoding :
            return [NSJapaneseEUCStringEncodingDetector newWithData:inData] ;
        case NSUTF8StringEncoding :
            return [NSUTF8StringEncodingEncodingDetector newWithData:inData] ;
        case NSISOLatin1StringEncoding :
            return [NSISOLatin1StringEncodingDetector newWithData:inData] ;
        case NSSymbolStringEncoding :
            return [NSSymbolStringEncodingDetector newWithData:inData] ;
        case NSNonLossyASCIIStringEncoding :
            return [NSNonLossyASCIIStringEncodingDetector newWithData:inData] ;
        case NSShiftJISStringEncoding :
            return [NSShiftJISStringEncodingDetector newWithData:inData] ;
        case NSISOLatin2StringEncoding :
            return [NSISOLatin2StringEncodingDetector newWithData:inData] ;
        case NSUnicodeStringEncoding :
            return [NSUnicodeStringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1251StringEncoding :
            return [NSWindowsCP1251StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1252StringEncoding :
            return [NSWindowsCP1252StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1253StringEncoding :
            return [NSWindowsCP1253StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1254StringEncoding :
            return [NSWindowsCP1254StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1250StringEncoding :
            return [NSWindowsCP1250StringEncodingDetector newWithData:inData] ;
        case NSISO2022JPStringEncoding :
            return [NSISO2022JPStringEncodingDetector newWithData:inData] ;
        case NSMacOSRomanStringEncoding :
            return [NSMacOSRomanStringEncodingDetector newWithData:inData] ;
        default :
            return nil ;
    }
}

/*
 * Retains inData and positions the detector on its first byte. -init is sent
 * to self so that a subclass' own -init runs as well.
 */
- (id)initWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding
{
    if ((self = [self init])) {
        _encoding = inEncoding ;
        _data = [inData retain] ;
        _buffer = [inData bytes] ;
        _position = _buffer ;
        _bufLength = [inData length] ;
        _eofPos = _position + _bufLength ;
    }
    return self ;
}

- (void)dealloc
{
    [_data release] ;
    [super dealloc] ;
}

/*
 * Consumes the next sequence when the subclass declares it valid.
 * Sets *outEOF to YES and returns NO when no byte is left. Returns YES after
 * moving the position forward, NO when the sequence is invalid, when the
 * test raised an exception, or when the reported length is not usable.
 */
- (BOOL)advance:(BOOL *)outEOF
{
    int toSkip = 0 ;               // same type as the toSkip: out-parameter
    volatile BOOL accepted = NO ;  // volatile: written inside NS_DURING, read after it
    unsigned remaining ;

    *outEOF = (_position >= _eofPos) ;
    if (*outEOF)
        return NO ;
    remaining = (unsigned)(_eofPos - _position) ;

    // No "return" inside NS_DURING: leaving the block that way would skip the
    // handler bookkeeping done by NS_ENDHANDLER.
    NS_DURING
        accepted = [(id)self isValidSequence:_position maxLength:remaining toSkip:&toSkip] ;
    NS_HANDLER
        NSLog(@"Problem with %@",[self class]) ;
        accepted = NO ;
    NS_ENDHANDLER

    if (!accepted)
        return NO ;
    // A zero or negative length would loop forever in -acceptData; a length
    // beyond the end would move the position outside the buffer.
    if ((toSkip <= 0) || ((unsigned)toSkip > remaining))
        return NO ;
    _position += toSkip ;
    return YES ;
}

/* Returns YES when every sequence up to the end of the data was accepted. */
- (BOOL)acceptData
{
    BOOL eof = NO ;

    while ([self advance:&eof] && !eof) {
        // keep consuming
    }
    return eof ;
}

- (unsigned)curPosition
{
    return _position - _buffer ;
}

- (NSStringEncoding)encoding
{
    return _encoding ;
}
@end
And finally, maybe the most powerful one, if you don't mind linking to Carbon:
#import
/* Encoding detection built on the Carbon Text Encoding Converter sniffers. */
@interface NSStringEncodingSniffer : NSObject

/* Returns the candidate encodings for the bytes of inString as an array of NSNumber. */
+ (NSArray *)sniffData:(NSData *)inString;

@end
#import
#import "NSStringEncodingSniffer.h"
// File-level state shared by _initSniffer() and +sniffData:.
// Number of encodings for which a sniffer is available.
static ItemCount _numEncoding = 0 ;
// YES once _initSniffer() has completed without error.
static BOOL _inited = NO ;
// Encodings reported by TECGetAvailableSniffers (malloc'ed array of _numEncoding entries).
static TextEncoding *_availableEncodings = NULL ;
// Sniffer object created from _availableEncodings.
static TECSnifferObjectRef _encodingSniffer = NULL ;
// Per-encoding error counts filled in by TECSniffTextEncoding.
static ItemCount *_numErrsArray = NULL ;
// Per-encoding feature counts filled in by TECSniffTextEncoding.
static ItemCount *_numFeaturesArray = NULL ;
/*
 * Creates the shared sniffer and the arrays it works with.
 * Sets _inited to YES only when every step succeeded. On failure all buffers
 * are freed and reset to NULL, so a later retry neither leaks the previous
 * allocations nor leaves a partially initialised state behind.
 */
static void _initSniffer(void)
{
    OSStatus error ;

    error = TECCountAvailableSniffers(&_numEncoding) ;
    if ((error == noErr) && (_numEncoding == 0))
        error = paramErr ;      // nothing to sniff with
    if (error == noErr) {
        _availableEncodings = (TextEncoding *)malloc( _numEncoding * sizeof(TextEncoding) ) ;
        _numErrsArray = (ItemCount *)malloc( _numEncoding * sizeof(ItemCount) ) ;
        _numFeaturesArray = (ItemCount *)malloc( _numEncoding * sizeof(ItemCount) ) ;
        if ((_availableEncodings == NULL) || (_numErrsArray == NULL) || (_numFeaturesArray == NULL))
            error = memFullErr ;
        if (error == noErr)
            error = TECGetAvailableSniffers(_availableEncodings, _numEncoding, &_numEncoding);
        if (error == noErr)
            error = TECCreateSniffer(&_encodingSniffer,_availableEncodings,_numEncoding);
        if (error != noErr) {
            // free(NULL) is a no-op, so partial allocations are handled too.
            free(_availableEncodings) ;
            free(_numErrsArray) ;
            free(_numFeaturesArray) ;
            _availableEncodings = NULL ;
            _numErrsArray = NULL ;
            _numFeaturesArray = NULL ;
            _encodingSniffer = NULL ;
        }
    }
    _inited = (error == noErr) ;
}
@implementation NSStringEncodingSniffer

/*
 * Sniffs the bytes of inString and returns the TextEncoding values for which
 * the sniffer reported zero errors, wrapped in NSNumber objects and in the
 * order produced by TECSniffTextEncoding. Returns an empty array when the
 * sniffer is unavailable or the sniffing fails.
 */
+ (NSArray *)sniffData:(NSData *)inString
{
    OSStatus error ;
    NSMutableArray *result ;

    if (!_inited) {
        _initSniffer();
    }
    // Created after the initialisation so that the capacity hint is meaningful.
    result = [NSMutableArray arrayWithCapacity:_numEncoding] ;

    if (_inited && (_encodingSniffer != NULL)) {
        // The sniffer reorders the array it is given; work on a private copy
        // so that the shared list stays intact.
        TextEncoding *availableEncodings = (TextEncoding *)malloc( _numEncoding * sizeof(TextEncoding) ) ;
        if (availableEncodings != NULL) {
            memcpy(availableEncodings, _availableEncodings, _numEncoding * sizeof(TextEncoding)) ;
            // The sniffer object is shared between calls: reset what it
            // remembers from the previous buffer before looking at a new one.
            TECClearSnifferContextInfo(_encodingSniffer) ;
            error = TECSniffTextEncoding (_encodingSniffer, (TextPtr)[inString bytes],
                (ByteCount) [inString length], availableEncodings, _numEncoding, _numErrsArray,
                _numEncoding, _numFeaturesArray, _numEncoding);
            if (error == noErr) {
                ItemCount i ;
                // Stop at the first encoding that produced errors.
                for (i = 0; (i < _numEncoding) && (_numErrsArray[i] == 0); i++) {
                    // TextEncoding is an unsigned 32-bit value.
                    [result addObject:[NSNumber numberWithUnsignedInt:availableEncodings[i]]] ;
                }
            }
            free(availableEncodings) ;
        }
    }
    return [NSArray arrayWithArray:result] ;
}
@end
Note that this one returns an array of NSNumber objects containing TextEncoding values, whose range is much larger than that of NSStringEncoding.
Authored by: Admin on Monday, December 18 2006 @ 04:32 PM GMT
Other interesting links:
Text examples in various encoding
W3C about XML encoding detection
Charguess for Ruby
However, a lot of standard text formats (like HTML, XML, …) include the definition of the encoding used in their document structure. If you know what kind of text you have, you can parse the beginning of it to determine its encoding.
If you are writing a text editor and want to provide an auto-detecting file-reading routine, you may try to read the file first in the most "high level" encoding and fall back to simpler ones in case of failure: try UTF-32 first; if it fails, fall back to UTF-16, then UTF-8, then ASCII, ...
A sequence of
// Try each candidate encoding in turn and stop at the first one that
// produces a string from theData.
NSString *myString = nil ;
NSStringEncoding myEncodingToTest[] = { ...put here the encoding you want to test...};
int i, howManyEncodings = sizeof(myEncodingToTest) / sizeof(myEncodingToTest[0]) ;
for (i = 0; (i < howManyEncodings) && (myString == nil); i++) {
    NS_DURING
        // Use the i-th candidate; myString stays nil when the decode fails,
        // which lets the loop move on to the next encoding.
        myString = [[NSString alloc] initWithData:theData encoding:myEncodingToTest[i]] ;
    NS_HANDLER
        // An exception counts as "this encoding does not fit": keep trying.
    NS_ENDHANDLER
}
if (myString == nil) {
    // failed to parse the string in a "valid" encoding
    return NO ;
}
// success (the caller owns myString, which was created with alloc/init)
return YES ;
will be a simplistic way of solving the problem. Once at the ASCII level, you may still need help from the user because there is nothing to tell you whether the ASCII file was created with a Windows, ISO or Mac encoding, etc. The line endings may give some hint, but even this is not 100% reliable...
A more customizable version of the above principle could be the following one (you may improve it to fit your needs; in particular, decide which characters below ' ' are acceptable and whether 0x7F, often used as the DEL character, is valid in a string).
/*
 * Base class of the per-encoding detectors. An instance walks over the bytes
 * of an NSData object and reports whether they all pass the validity test of
 * one particular NSStringEncoding.
 */
@interface NSStringEncodingDetector : NSObject
{
@protected
    NSStringEncoding     _encoding;    // encoding this detector checks for
    NSData              *_data;        // source data
    const unsigned char *_buffer;      // first byte of the data
    const unsigned char *_position;    // next byte to examine
    const unsigned char *_eofPos;      // end-of-data marker
    unsigned             _bufLength;   // length of the data in bytes
}

/* Array of encodings (as NSNumber) that are valid for inData. */
+ (NSArray *)validEncodingsForData:(NSData *)inData;

/* Detector for inEncoding. */
+ (id)newWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding;

- (id)initWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding;

/* Moves past one valid sequence; *outEOF reports the end of the data. */
- (BOOL)advance:(BOOL *)outEOF;

/* Tests the whole buffer. */
- (BOOL)acceptData;

/* Current position in the buffer. */
- (unsigned)curPosition;

- (NSStringEncoding)encoding;
@end
#import "NSStringEncodingDetector.h"
/*
 * Methods every concrete detector subclass implements.
 */
@protocol NSStringEncodingDetectorTest

/* Creates a detector for the subclass' own encoding. */
+ (id)newWithData:(NSData *)inData;

/*
 * Examines the sequence starting at inBuffer (inMaxLength bytes remain).
 * Returns YES and stores the sequence length in *outValidSeqtoSkip when the
 * sequence is valid for the encoding; returns NO otherwise.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer
              maxLength:(unsigned)inMaxLength
                 toSkip:(int *)outValidSeqtoSkip;

@end
/* Detector for NSASCIIStringEncoding; adds no instance variables. */
@interface NSASCIIStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSNEXTSTEPStringEncoding; adds no instance variables. */
@interface NSNEXTSTEPStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSJapaneseEUCStringEncoding; adds no instance variables. */
@interface NSJapaneseEUCStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSUTF8StringEncoding; adds no instance variables. */
@interface NSUTF8StringEncodingEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSISOLatin1StringEncoding; adds no instance variables. */
@interface NSISOLatin1StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSSymbolStringEncoding; adds no instance variables. */
@interface NSSymbolStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSNonLossyASCIIStringEncoding; adds no instance variables. */
@interface NSNonLossyASCIIStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSShiftJISStringEncoding; adds no instance variables. */
@interface NSShiftJISStringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSISOLatin2StringEncoding; adds no instance variables. */
@interface NSISOLatin2StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSUnicodeStringEncoding; keeps byte-order state between calls. */
@interface NSUnicodeStringEncodingDetector : NSStringEncodingDetector {
@protected
    NSCharacterSet *_illCharSet ;   // characters that make a sequence invalid
    BOOL _swapBytes ;               // byte-order flag
    BOOL _swapTested ;              // YES once the start of the data was examined
    BOOL _isUTF32 ;                 // YES when the data is treated as UTF-32
}
@end
/* Detector for NSWindowsCP1251StringEncoding; adds no instance variables. */
@interface NSWindowsCP1251StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSWindowsCP1252StringEncoding; adds no instance variables. */
@interface NSWindowsCP1252StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSWindowsCP1253StringEncoding; adds no instance variables. */
@interface NSWindowsCP1253StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSWindowsCP1254StringEncoding; adds no instance variables. */
@interface NSWindowsCP1254StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Detector for NSWindowsCP1250StringEncoding; adds no instance variables. */
@interface NSWindowsCP1250StringEncodingDetector : NSStringEncodingDetector {
}
@end
/* Character set currently selected by an ISO-2022-JP escape sequence. */
enum {
    EISO2022_esc_ASCII    = 0,
    EISO2022_esc_JISRoman = 1,
    EISO2022_esc_JISC     = 2,
    EISO2022_esc_JISX     = 3
};
/* Detector for NSISO2022JPStringEncoding; remembers the active escape state. */
@interface NSISO2022JPStringEncodingDetector : NSStringEncodingDetector {
@protected
    int _currentEscEncoding ;   // one of the EISO2022_esc_* values
}
@end
/* Detector for NSMacOSRomanStringEncoding; adds no instance variables. */
@interface NSMacOSRomanStringEncodingDetector : NSStringEncodingDetector {
}
@end
@implementation NSASCIIStringEncodingDetector

/* Returns an autoreleased detector that tests inData against 7-bit ASCII. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSASCIIStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer.
 * Returns NO for control characters other than \n, \r, \t and \f, and for
 * any byte above 0x7F; otherwise stores 1 in *outValidSeqtoSkip and returns YES.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    // NSASCIIStringEncoding is strict 7-bit ASCII: 8-bit bytes cannot belong to it.
    if (c > 0x7F)
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSNEXTSTEPStringEncodingDetector

/* Returns an autoreleased detector that tests inData against the NEXTSTEP encoding. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSNEXTSTEPStringEncoding] autorelease] ;
}

/*
 * Accepts exactly one byte at inBuffer (8-bit, single-byte encoding).
 * Only control characters other than \n, \r, \t and \f are rejected.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char c = *inBuffer ;

    // The escapes matter: comparing against the letters 'n', 'r', 't', 'f'
    // would reject every newline and tab.
    if ((c < 0x20) && (c != '\n') && (c != '\r') && (c != '\t') && (c != '\f'))
        return NO ;
    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
@implementation NSJapaneseEUCStringEncodingDetector

/* Returns an autoreleased detector that tests inData against EUC-JP. */
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSJapaneseEUCStringEncoding] autorelease] ;
}

/*
 * Validates one EUC-JP sequence at inBuffer:
 *   0x00-0x7F                        one byte (control chars limited to \n \r \t \f)
 *   0x8E, 0xA1-0xDF                  two bytes (half-width katakana, SS2)
 *   0x8F, 0xA1-0xFE, 0xA1-0xFE       three bytes (JIS X 0212, SS3)
 *   0xA1-0xFE, 0xA1-0xFE             two bytes (JIS X 0208)
 * Stores the sequence length in *outValidSeqtoSkip and returns YES, or returns NO.
 */
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;

    if (lead <= 0x7F) {
        if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
            return NO ;
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }
    if (lead == 0x8E) {
        if ((inMaxLength < 2) || (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xDF))
            return NO ;
        *outValidSeqtoSkip = 2 ;
        return YES ;
    }
    if (lead == 0x8F) {
        if ((inMaxLength < 3) ||
            (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xFE) ||
            (inBuffer[2] < 0xA1) || (inBuffer[2] > 0xFE))
            return NO ;
        *outValidSeqtoSkip = 3 ;
        return YES ;
    }
    if ((lead < 0xA1) || (lead > 0xFE))
        return NO ;
    // Two-byte character: the trail byte must be in the same range as the lead.
    if ((inMaxLength < 2) || (inBuffer[1] < 0xA1) || (inBuffer[1] > 0xFE))
        return NO ;
    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
// Lookup table indexed by the first byte of a UTF-8 sequence; the entry is the
// number of bytes that follow that first byte. The UTF-8 detector reads it as
// trailingBytesForUTF8[*inBuffer] + 1 to get the whole sequence length.
// Each row below covers 32 consecutive byte values.
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x00-0x1F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x20-0x3F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x40-0x5F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x60-0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0x9F (also mapped to 0)
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0-0xBF (also mapped to 0)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xC0-0xDF
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  // 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFF: 5
};
// Validates a byte stream as UTF-8 text.
@implementation NSUTF8StringEncodingEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSUTF8StringEncoding] autorelease] ;
}

// Checks the UTF-8 sequence starting at inBuffer, with inMaxLength bytes
// available. Returns YES and stores the sequence length in *outValidSeqtoSkip,
// or NO when the bytes are not a well-formed sequence.
//
// Fixes over the previous version:
//  - a lone continuation byte (0x80-0xBF) is no longer accepted as a character;
//  - every trailing byte must be of the form 10xxxxxx (previously only
//    ">= 0x80" was tested, and the last byte of the longest form was skipped);
//  - lead bytes that can never appear in UTF-8 (0xC0, 0xC1, 0xF5-0xFF) are
//    rejected, which also removes the obsolete 5- and 6-byte forms.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;
    unsigned seqLength ;
    unsigned i ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
        return NO ;

    // Continuation bytes cannot start a sequence.
    if ((lead >= 0x80) && (lead <= 0xBF))
        return NO ;

    // 0xC0/0xC1 would only encode overlong forms; 0xF5 and above would encode
    // values beyond U+10FFFF.
    if ((lead == 0xC0) || (lead == 0xC1) || (lead > 0xF4))
        return NO ;

    seqLength = (unsigned)trailingBytesForUTF8[lead] + 1 ;
    if (seqLength > inMaxLength)
        return NO ;

    for (i = 1 ; i < seqLength ; i++) {
        if ((inBuffer[i] & 0xC0) != 0x80)
            return NO ;
    }

    *outValidSeqtoSkip = (int)seqLength ;
    return YES ;
}
@end
// Validates a byte stream as ISO-8859-1 (Latin-1) text.
@implementation NSISOLatin1StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISOLatin1StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // DEL and the C1 control range are not expected in text.
    // 0xA0 (no-break space) is a regular Latin-1 character and is accepted;
    // the previous upper bound of 160 rejected it.
    if ((byte >= 0x7F) && (byte <= 0x9F))
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream against the Symbol encoding.
@implementation NSSymbolStringEncodingDetector

// Returns an autoreleased detector bound to inData.
+ (id)newWithData:(NSData *)inData
{
    id detector = [[self alloc] initWithData:inData forEncoding:NSSymbolStringEncoding] ;
    return [detector autorelease] ;
}

// Accepts any byte from 0x20 upward except 0xFF.
// On acceptance *outValidSeqtoSkip is set to 1; on rejection it is untouched.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = inBuffer[0] ;
    const BOOL accepted = (byte >= 0x20) && (byte != 0xFF) ;

    if (accepted) {
        *outValidSeqtoSkip = 1 ;
    }
    return accepted ;
}
@end
// Validates a byte stream as NSNonLossyASCIIStringEncoding text
// (7-bit ASCII in which other characters are written as escape sequences).
@implementation NSNonLossyASCIIStringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSNonLossyASCIIStringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // The encoding is 7-bit: a byte with the high bit set cannot belong to it.
    if (byte > 0x7F)
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as Shift-JIS text.
@implementation NSShiftJISStringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSShiftJISStringEncoding] autorelease] ;
}

// Checks the character starting at inBuffer, with inMaxLength bytes available.
// Returns YES and stores the character's byte length (1 or 2) in
// *outValidSeqtoSkip, or NO when the bytes are not valid Shift-JIS.
//
// The previous version only accepted double-byte characters, so any text
// containing ASCII (including line endings) or half-width katakana was
// rejected, as was any data ending on a single-byte character.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char lead = inBuffer[0] ;
    unsigned char trail ;

    // Single byte, ASCII / JIS-Roman range.
    if (lead < 0x80) {
        if ((lead < 0x20) && (lead != '\n') && (lead != '\r') && (lead != '\t') && (lead != '\f'))
            return NO ;
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }

    // Single byte, half-width katakana.
    if ((lead >= 0xA1) && (lead <= 0xDF)) {
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }

    // Double byte: lead byte must be 0x81-0x9F or 0xE0-0xEF.
    if (! ( ((lead >= 0x81) && (lead <= 0x9F)) || ((lead >= 0xE0) && (lead <= 0xEF)) ) )
        return NO ;
    if (inMaxLength < 2)
        return NO ;

    trail = inBuffer[1] ;
    // Same restriction as before on the last lead byte.
    if ( (lead == 0xEF) && !((trail >= 0x40) && (trail <= 0x9E)) )
        return NO ;
    // Trail bytes are 0x40-0x7E and 0x80-0xFC; 0x7F is never a trail byte.
    if ((trail < 0x40) || (trail > 0xFC) || (trail == 0x7F))
        return NO ;

    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
// Validates a byte stream as ISO-8859-2 (Latin-2) text.
@implementation NSISOLatin2StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISOLatin2StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // 0x80-0x9F is the C1 control range, not expected in text.
    if ((byte > 0x7F) && (byte < 0xA0))
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as UTF-16 or UTF-32 text, using a leading byte
// order mark (BOM) when present.
@implementation NSUnicodeStringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSUnicodeStringEncoding] autorelease] ;
}

// Sets up the illegal-character set and the BOM state.
- (id)init
{
    if ((self = [super init])) {
        _illCharSet = [[NSCharacterSet illegalCharacterSet] retain] ;
        _swapBytes = NO ;    // YES when the data's byte order differs from the host's
        _swapTested = NO ;   // YES once the first code unit has been examined
        _isUTF32 = NO ;      // YES when a UTF-32 BOM was found
    }
    return self ;
}

// Releases the character set retained in -init (it was previously leaked).
- (void)dealloc
{
    [_illCharSet release] ;
    [super dealloc] ;
}

// Checks the code unit starting at inBuffer, with inMaxLength bytes available.
// Returns YES and stores the number of bytes consumed (2 or 4) in
// *outValidSeqtoSkip, or NO when the bytes are not acceptable.
//
// Fixes over the previous version:
//  - the FE FF BOM path returned YES without setting the skip length;
//  - a first byte of 0xFF was accepted without checking that 0xFE follows;
//  - BOM handling assumed a big-endian host; byte order is now compared with
//    NSHostByteOrder();
//  - byte-swapped UTF-32 swapped 16-bit halves instead of bytes;
//  - code units are assembled from bytes rather than read through possibly
//    unaligned pointer casts;
//  - the skip length is assigned, not added to whatever the caller passed in.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const BOOL hostIsLittleEndian = (NSHostByteOrder() == NS_LittleEndian) ;
    BOOL dataIsLittleEndian ;

    if (inMaxLength < sizeof(unichar))
        return NO ;

    if (!_swapTested) {
        const BOOL hasFourBytes = (inMaxLength >= sizeof(UTF32Char)) ;
        _swapTested = YES ;

        // UTF-32 big-endian BOM: 00 00 FE FF
        if (hasFourBytes && (inBuffer[0] == 0x00) && (inBuffer[1] == 0x00) && (inBuffer[2] == 0xFE) && (inBuffer[3] == 0xFF)) {
            _isUTF32 = YES ;
            _swapBytes = hostIsLittleEndian ;
            *outValidSeqtoSkip = sizeof(UTF32Char) ;
            return YES ;
        }
        // UTF-32 little-endian BOM: FF FE 00 00
        // (as before, this wins over "UTF-16 BOM followed by U+0000")
        if (hasFourBytes && (inBuffer[0] == 0xFF) && (inBuffer[1] == 0xFE) && (inBuffer[2] == 0x00) && (inBuffer[3] == 0x00)) {
            _isUTF32 = YES ;
            _swapBytes = !hostIsLittleEndian ;
            *outValidSeqtoSkip = sizeof(UTF32Char) ;
            return YES ;
        }
        // UTF-16 big-endian BOM: FE FF
        if ((inBuffer[0] == 0xFE) && (inBuffer[1] == 0xFF)) {
            _swapBytes = hostIsLittleEndian ;
            *outValidSeqtoSkip = sizeof(unichar) ;
            return YES ;
        }
        // UTF-16 little-endian BOM: FF FE
        if ((inBuffer[0] == 0xFF) && (inBuffer[1] == 0xFE)) {
            _swapBytes = !hostIsLittleEndian ;
            *outValidSeqtoSkip = sizeof(unichar) ;
            return YES ;
        }
        // As before, data that starts like a BOM (0xFE.., 0xFF.. or 00 00)
        // without completing one is rejected.
        if ((inBuffer[0] == 0xFE) || (inBuffer[0] == 0xFF) || ((inBuffer[0] == 0x00) && (inBuffer[1] == 0x00)))
            return NO ;
        // No BOM: fall through and read the data in host byte order.
    }

    dataIsLittleEndian = _swapBytes ? !hostIsLittleEndian : hostIsLittleEndian ;

    if (_isUTF32) {
        UTF32Char codePoint ;
        if (inMaxLength < sizeof(UTF32Char))
            return NO ;
        if (dataIsLittleEndian)
            codePoint = ((UTF32Char)inBuffer[3] << 24) | ((UTF32Char)inBuffer[2] << 16) | ((UTF32Char)inBuffer[1] << 8) | (UTF32Char)inBuffer[0] ;
        else
            codePoint = ((UTF32Char)inBuffer[0] << 24) | ((UTF32Char)inBuffer[1] << 16) | ((UTF32Char)inBuffer[2] << 8) | (UTF32Char)inBuffer[3] ;
        // Values above U+10FFFF are outside Unicode.
        if ((codePoint > 0x10FFFF) || [_illCharSet longCharacterIsMember:codePoint])
            return NO ;
        *outValidSeqtoSkip = sizeof(UTF32Char) ;
        return YES ;
    }

    {
        unichar codeUnit ;
        if (dataIsLittleEndian)
            codeUnit = (unichar)((inBuffer[1] << 8) | inBuffer[0]) ;
        else
            codeUnit = (unichar)((inBuffer[0] << 8) | inBuffer[1]) ;
        if ([_illCharSet characterIsMember:codeUnit])
            return NO ;
        *outValidSeqtoSkip = sizeof(unichar) ;
        return YES ;
    }
}
@end
// Validates a byte stream as Windows-1251 (Cyrillic) text.
@implementation NSWindowsCP1251StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1251StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // 0x98 is the only unassigned position in Windows-1251.
    // The previous test rejected 0xB0-0xDF and 0xF2-0xFE, which matches the
    // DOS code page 866 layout and threw away valid CP1251 Cyrillic letters
    // (0xC0-0xDF are the capitals).
    if (byte == 0x98)
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as Windows-1252 (Western European) text.
@implementation NSWindowsCP1252StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1252StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // Positions left unassigned in this code page.
    switch (byte) {
        case 0x81 :
        case 0x8D :
        case 0x8F :
        case 0x90 :
        case 0x9D :
            return NO ;
        default :
            break ;
    }

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as Windows-1253 (Greek) text.
@implementation NSWindowsCP1253StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1253StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // Positions left unassigned in this code page. 0x8A was missing from the
    // previous list, and the separate 0x9C test duplicated the 0x9C-0x9F range.
    if ((byte == 0x81) ||
        (byte == 0x88) ||
        (byte == 0x8A) ||
        ((byte >= 0x8C) && (byte <= 0x90)) ||
        (byte == 0x98) ||
        (byte == 0x9A) ||
        ((byte >= 0x9C) && (byte <= 0x9F)) ||
        (byte == 0xAA) ||
        (byte == 0xD2) ||
        (byte == 0xFF) )
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as Windows-1254 (Turkish) text.
@implementation NSWindowsCP1254StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1254StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // Positions left unassigned in this code page.
    if ((byte == 0x81) ||
        ((byte >= 0x8D) && (byte <= 0x90)) ||
        (byte == 0x9D) ||
        (byte == 0x9E) )
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as Windows-1250 (Central European) text.
@implementation NSWindowsCP1250StringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSWindowsCP1250StringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // Control characters are rejected except \n \r \t \f (the escapes had
    // lost their backslashes, which rejected every control byte).
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    // Positions left unassigned in this code page.
    switch (byte) {
        case 0x81 :
        case 0x83 :
        case 0x88 :
        case 0x90 :
        case 0x98 :
            return NO ;
        default :
            break ;
    }

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Validates a byte stream as ISO-2022-JP text. The encoding is stateful:
// escape sequences switch between single-byte sets (ASCII, JIS-Roman) and
// double-byte sets, and the detector tracks the active one.
@implementation NSISO2022JPStringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSISO2022JPStringEncoding] autorelease] ;
}

// A stream starts in ASCII mode.
- (id)init
{
    if ((self = [super init])) {
        _currentEscEncoding = EISO2022_esc_ASCII ;
    }
    return self ;
}

// Checks the sequence starting at inBuffer, with inMaxLength bytes available.
// Returns YES and stores the number of bytes consumed in *outValidSeqtoSkip
// (1 for a single-byte character, 2 for a double-byte character, 3 for an
// escape sequence), or NO when the bytes are not acceptable.
// A recognised escape sequence also updates _currentEscEncoding.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = inBuffer[0] ;
    const BOOL inSingleByteSet = (_currentEscEncoding == EISO2022_esc_ASCII) || (_currentEscEncoding == EISO2022_esc_JISRoman) ;
    const BOOL inDoubleByteSet = (_currentEscEncoding == EISO2022_esc_JISC) || (_currentEscEncoding == EISO2022_esc_JISX) ;

    if (byte < 0x20) {
        // \n \r \t \f are accepted in any mode. The previous test joined the
        // comparisons with && (and had lost the backslashes), so it could
        // never be true and every line ending was rejected.
        if ((byte == '\n') || (byte == '\r') || (byte == '\t') || (byte == '\f')) {
            *outValidSeqtoSkip = 1 ;
            return YES ;
        }

        // The only other control byte accepted is ESC, starting a 3-byte
        // character-set switch.
        if ((byte != 0x1B) || (inMaxLength < 3))
            return NO ;

        if (inBuffer[1] == '$') {
            // Switch to a double-byte set: only allowed from a single-byte set.
            if (!inSingleByteSet)
                return NO ;
            if (inBuffer[2] == '@')
                _currentEscEncoding = EISO2022_esc_JISC ;
            else if (inBuffer[2] == 'B')
                _currentEscEncoding = EISO2022_esc_JISX ;
            else
                return NO ;
            *outValidSeqtoSkip = 3 ;
            return YES ;
        }

        if (inBuffer[1] == '(') {
            // Switch back to a single-byte set: only allowed from a double-byte set.
            if (!inDoubleByteSet)
                return NO ;
            if (inBuffer[2] == 'B')
                _currentEscEncoding = EISO2022_esc_ASCII ;
            else if (inBuffer[2] == 'J')
                _currentEscEncoding = EISO2022_esc_JISRoman ;
            else
                return NO ;
            *outValidSeqtoSkip = 3 ;
            return YES ;
        }

        // ESC followed by anything else is not a sequence we know.
        return NO ;
    }

    // byte >= 0x20
    if (inSingleByteSet) {
        // ISO-2022-JP is a 7-bit encoding; a byte with the high bit set
        // cannot belong to it.
        if (byte > 0x7F)
            return NO ;
        *outValidSeqtoSkip = 1 ;
        return YES ;
    }

    // Double-byte set: both bytes must be in 0x21-0x7E.
    if (inMaxLength < 2)
        return NO ;
    if ((byte < 0x21) || (byte > 0x7E) || (inBuffer[1] < 0x21) || (inBuffer[1] > 0x7E))
        return NO ;
    *outValidSeqtoSkip = 2 ;
    return YES ;
}
@end
// Validates a byte stream as Mac OS Roman (8-bit) text.
@implementation NSMacOSRomanStringEncodingDetector

// Returns an autoreleased detector bound to inData (not owned by the caller,
// despite the "new" prefix).
+ (id)newWithData:(NSData *)inData
{
    return [[[self alloc] initWithData:inData forEncoding:NSMacOSRomanStringEncoding] autorelease] ;
}

// Checks the byte at inBuffer.
// Returns YES and sets *outValidSeqtoSkip to 1 when the byte is acceptable,
// NO otherwise.
- (BOOL)isValidSequence:(const unsigned char *)inBuffer maxLength:(unsigned)inMaxLength toSkip:(int *)outValidSeqtoSkip
{
    const unsigned char byte = *inBuffer ;

    // 8-bit encoding: only bytes below 0x20 need checking.
    // \r \n \t \f are valid characters, any other control byte is rejected.
    // (The escapes had lost their backslashes, so newlines were rejected.)
    if ((byte < 0x20) && (byte != '\n') && (byte != '\r') && (byte != '\t') && (byte != '\f'))
        return NO ;

    *outValidSeqtoSkip = 1 ;
    return YES ;
}
@end
// Base class of the encoding detectors. It owns the data being scanned and
// walks through it one sequence at a time; each concrete subclass supplies
// -isValidSequence:maxLength:toSkip: for its encoding.
@implementation NSStringEncodingDetector

// Runs every known detector over inData and returns the encodings whose
// detector accepted the whole buffer, as an array of NSNumber (int values of
// NSStringEncoding), in the order listed below.
// The previous version tested NSWindowsCP1250StringEncoding twice and could
// therefore list it twice.
+ (NSArray *)validEncodingsForData:(NSData *)inData
{
    static const NSStringEncoding candidates[] = {
        NSASCIIStringEncoding,
        NSNEXTSTEPStringEncoding,
        NSJapaneseEUCStringEncoding,
        NSUTF8StringEncoding,
        NSISOLatin1StringEncoding,
        NSSymbolStringEncoding,
        NSNonLossyASCIIStringEncoding,
        NSShiftJISStringEncoding,
        NSISOLatin2StringEncoding,
        NSUnicodeStringEncoding,
        NSWindowsCP1251StringEncoding,
        NSWindowsCP1252StringEncoding,
        NSWindowsCP1253StringEncoding,
        NSWindowsCP1254StringEncoding,
        NSWindowsCP1250StringEncoding,
        NSISO2022JPStringEncoding,
        NSMacOSRomanStringEncoding
    } ;
    const unsigned candidateCount = sizeof(candidates) / sizeof(candidates[0]) ;
    NSMutableArray *result = [NSMutableArray arrayWithCapacity:candidateCount] ;
    unsigned i ;

    for (i = 0 ; i < candidateCount ; i++) {
        if ([[NSStringEncodingDetector newWithData:inData forEncoding:candidates[i]] acceptData])
            [result addObject:[NSNumber numberWithInt:(int)candidates[i]]] ;
    }
    return [NSArray arrayWithArray:result] ;
}

// Returns an autoreleased detector for inEncoding bound to inData, or nil
// when no detector exists for that encoding.
// NOTE: despite the "new" prefix the result is autoreleased, not owned by the
// caller; existing callers rely on that, so the convention is kept.
+ (id)newWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding
{
    switch (inEncoding) {
        case NSASCIIStringEncoding :
            return [NSASCIIStringEncodingDetector newWithData:inData] ;
        case NSNEXTSTEPStringEncoding :
            return [NSNEXTSTEPStringEncodingDetector newWithData:inData] ;
        case NSJapaneseEUCStringEncoding :
            return [NSJapaneseEUCStringEncodingDetector newWithData:inData] ;
        case NSUTF8StringEncoding :
            return [NSUTF8StringEncodingEncodingDetector newWithData:inData] ;
        case NSISOLatin1StringEncoding :
            return [NSISOLatin1StringEncodingDetector newWithData:inData] ;
        case NSSymbolStringEncoding :
            return [NSSymbolStringEncodingDetector newWithData:inData] ;
        case NSNonLossyASCIIStringEncoding :
            return [NSNonLossyASCIIStringEncodingDetector newWithData:inData] ;
        case NSShiftJISStringEncoding :
            return [NSShiftJISStringEncodingDetector newWithData:inData] ;
        case NSISOLatin2StringEncoding :
            return [NSISOLatin2StringEncodingDetector newWithData:inData] ;
        case NSUnicodeStringEncoding :
            return [NSUnicodeStringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1251StringEncoding :
            return [NSWindowsCP1251StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1252StringEncoding :
            return [NSWindowsCP1252StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1253StringEncoding :
            return [NSWindowsCP1253StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1254StringEncoding :
            return [NSWindowsCP1254StringEncodingDetector newWithData:inData] ;
        case NSWindowsCP1250StringEncoding :
            return [NSWindowsCP1250StringEncodingDetector newWithData:inData] ;
        case NSISO2022JPStringEncoding :
            return [NSISO2022JPStringEncodingDetector newWithData:inData] ;
        case NSMacOSRomanStringEncoding :
            return [NSMacOSRomanStringEncodingDetector newWithData:inData] ;
        default :
            return nil ;
    }
}

// Initializer used by the subclasses' factory methods. Goes through
// [self init] so that a subclass -init override sets up its own state, then
// retains inData and positions the cursor on its first byte.
- (id)initWithData:(NSData *)inData forEncoding:(NSStringEncoding)inEncoding
{
    if ((self = [self init])) {
        _encoding = inEncoding ;
        _data = [inData retain] ;   // keeps the bytes pointed to by _buffer alive
        _buffer = [inData bytes] ;
        _position = _buffer ;
        _bufLength = [inData length] ;
        _eofPos = _position + _bufLength ;
    }
    return self ;
}

- (void)dealloc
{
    [_data release] ;
    [super dealloc] ;
}

// Validates the sequence at the cursor and moves the cursor past it.
// *outEOF is set to YES when the cursor is already at the end of the data.
// Returns YES when a sequence was accepted and skipped; NO at end of data,
// when the subclass rejects the sequence, or when it raised an exception.
//
// The call to the subclass had been truncated in this file and is restored
// here. Other changes: no `return` inside the NS_DURING block, the skip count
// is an int to match the method's int * parameter, and a non-positive or
// overlong skip is rejected explicitly (an NSAssert alone disappears in
// release builds and would let -acceptData loop forever).
- (BOOL)advance:(BOOL *)outEOF
{
    int toSkip = 0 ;
    volatile BOOL accepted = NO ;
    unsigned remaining ;

    *outEOF = (_position >= _eofPos) ;
    if (*outEOF) {
        return NO ;
    }
    remaining = (unsigned)(_eofPos - _position) ;

    NS_DURING
        accepted = [(id)self isValidSequence:(const unsigned char *)_position maxLength:remaining toSkip:&toSkip] ;
    NS_HANDLER
        NSLog(@"Problem with %@",[self class]) ;
        accepted = NO ;
    NS_ENDHANDLER

    if (!accepted)
        return NO ;
    if ((toSkip <= 0) || ((unsigned)toSkip > remaining)) {
        NSLog(@"Problem with %@",[self class]) ;
        return NO ;
    }
    _position += toSkip ;
    return YES ;
}

// Scans the whole buffer. Returns YES when the end of the data was reached
// with every sequence accepted (empty data is therefore accepted), NO as soon
// as a sequence is rejected.
- (BOOL)acceptData
{
    BOOL eof = NO ;
    while ([self advance:&eof] && !eof) {
        // nothing to do: -advance: moves the cursor
    }
    return eof ;
}

// Offset of the cursor from the start of the data, in bytes. After a failed
// -acceptData this is the offset of the first rejected sequence.
- (unsigned)curPosition
{
    return (unsigned)(_position - _buffer) ;
}

// The encoding this detector was created for.
- (NSStringEncoding)encoding
{
    return _encoding ;
}
@end
// NSStringEncodingSniffer.h
// (the import target was lost in this copy; Foundation provides every type
// used by this header)
#import <Foundation/Foundation.h>

// Wrapper around the Text Encoding Converter "sniffer" API.
@interface NSStringEncodingSniffer : NSObject {
}
// Returns the encodings the sniffer reports without errors for the bytes of
// inString, as an array of NSNumber. The numbers are TextEncoding values as
// returned by the sniffer, not NSStringEncoding values.
+ (NSArray *)sniffData:(NSData *)inString;
@end
// NSStringEncodingSniffer.m
// (the import target was lost in this copy; the TEC sniffer API and the
// TextEncoding / ItemCount types come from CoreServices)
#import <CoreServices/CoreServices.h>
#import "NSStringEncodingSniffer.h"

// Shared sniffer state, filled in lazily by _initSniffer().
static ItemCount _numEncoding = 0 ;                  // number of encodings the sniffer handles
static BOOL _inited = NO ;                           // YES once _initSniffer() succeeded
static TextEncoding *_availableEncodings = NULL ;    // encodings the sniffer was created with
static TECSnifferObjectRef _encodingSniffer = NULL ; // the sniffer itself
static ItemCount *_numErrsArray = NULL ;             // per-encoding error counts from the last sniff
static ItemCount *_numFeaturesArray = NULL ;         // per-encoding feature counts from the last sniff
// Creates the shared sniffer for every encoding the system can sniff and
// allocates the result arrays used by +sniffData:. Sets _inited to YES on
// success. On failure everything allocated here is released again, so a later
// retry starts from a clean state (the previous version leaked the buffers
// and never checked malloc).
static void _initSniffer(void)
{
    OSStatus error = TECCountAvailableSniffers(&_numEncoding) ;

    if ((error == noErr) && (_numEncoding > 0)) {
        // free(NULL) is a no-op; this only matters if state was left behind.
        free(_availableEncodings) ;
        free(_numErrsArray) ;
        free(_numFeaturesArray) ;

        _availableEncodings = (TextEncoding *)malloc( _numEncoding * sizeof(TextEncoding) ) ;
        _numErrsArray = (ItemCount *)malloc( _numEncoding * sizeof(ItemCount) ) ;
        _numFeaturesArray = (ItemCount *)malloc( _numEncoding * sizeof(ItemCount) ) ;

        if ((_availableEncodings == NULL) || (_numErrsArray == NULL) || (_numFeaturesArray == NULL)) {
            error = memFullErr ;
        }
        else {
            error = TECGetAvailableSniffers(_availableEncodings, _numEncoding, &_numEncoding);
            if (error == noErr)
                error = TECCreateSniffer(&_encodingSniffer,_availableEncodings,_numEncoding);
        }
    }

    if (error != noErr) {
        free(_availableEncodings) ;
        free(_numErrsArray) ;
        free(_numFeaturesArray) ;
        _availableEncodings = NULL ;
        _numErrsArray = NULL ;
        _numFeaturesArray = NULL ;
        _numEncoding = 0 ;
    }
    _inited = (error == noErr) ;
}
@implementation NSStringEncodingSniffer

// Sniffs the bytes of inString and returns the encodings for which the
// sniffer reported no error, as an array of NSNumber holding TextEncoding
// values (not NSStringEncoding values). Returns an empty array when the
// sniffer is unavailable, the data is empty, or sniffing fails.
//
// NOTE: the sniffer and its result arrays are shared file statics, so this
// method must not be called from several threads at once.
+ (NSArray *)sniffData:(NSData *)inString
{
    NSMutableArray *result ;

    if (!_inited) {
        _initSniffer();
    }
    // Created after initialisation so the capacity hint is meaningful even
    // on the first call.
    result = [NSMutableArray arrayWithCapacity:_numEncoding] ;

    if ((_encodingSniffer != NULL) && (_numErrsArray != NULL) && (_numFeaturesArray != NULL)
        && (_numEncoding > 0) && ([inString length] > 0)) {
        // TECSniffTextEncoding reorders the array it is given, so work on a
        // copy and keep the master list intact.
        TextEncoding *candidates = (TextEncoding *)malloc( _numEncoding * sizeof(TextEncoding) ) ;
        if (candidates != NULL) {
            OSStatus error ;
            memcpy(candidates, _availableEncodings, _numEncoding * sizeof(TextEncoding)) ;

            // Forget whatever a previous, unrelated buffer left in the sniffer.
            error = TECClearSnifferContextInfo(_encodingSniffer) ;
            if (error == noErr) {
                error = TECSniffTextEncoding (_encodingSniffer, (TextPtr)[inString bytes],
                    (ByteCount) [inString length], candidates, _numEncoding, _numErrsArray,
                    _numEncoding, _numFeaturesArray, _numEncoding);
            }
            if (error == noErr) {
                ItemCount i ;
                // Keep the leading run of encodings that had no error.
                for (i = 0 ; (i < _numEncoding) && (_numErrsArray[i] == 0) ; i++) {
                    [result addObject:[NSNumber numberWithInt:(int)candidates[i]]] ;
                }
            }
            free(candidates) ;
        }
    }
    return [NSArray arrayWithArray:result] ;
}
@end
Authored by: Admin on Monday, December 18 2006 @ 04:32 PM GMT
Other interesting links:
Text examples in various encodings
W3C about XML encoding detection
Charguess for Ruby