/*
 *  call-seq:
 *     NKF.guess1(str)  -> integer
 *
 *  Returns guessed encoding of _str_ as integer.
 *
 *  Algorithm described in:
 *  Ken Lunde. `Understanding Japanese Information Processing'
 *  Sebastopol, CA: O'Reilly & Associates.
 *
 *      case NKF.guess1(input)
 *      when NKF::JIS
 *        "ISO-2022-JP"
 *      when NKF::SJIS
 *        "Shift_JIS"
 *      when NKF::EUC
 *        "EUC-JP"
 *      when NKF::UNKNOWN
 *        "UNKNOWN(ASCII)"
 *      when NKF::BINARY
 *        "BINARY"
 *      end
 *
 *  The string is scanned from the start and the result is decided by the
 *  first byte (or byte pair) that settles the question.  An empty string,
 *  or one that ends before any such byte is seen, yields the UNKNOWN code.
 */
static VALUE
rb_nkf_guess1(VALUE obj, VALUE src)
{
    const unsigned char *p;
    const unsigned char *pend;
    /* Position inside a run of two-byte units whose first byte is 0xa4. */
    int sequence_counter = 0;

    StringValue(src);
    p = (const unsigned char *)RSTRING(src)->ptr;
    pend = p + RSTRING(src)->len;
    if (p == pend) return INT2FIX(_UNKNOWN);

/*
 * Advance p by one byte.  NOTE: this macro returns from the enclosing
 * function: with _UNKNOWN when the input is exhausted, and with _EUC once
 * sequence_counter reaches 6, i.e. after three consecutive byte pairs that
 * each start with 0xa4 (presumably the EUC-JP hiragana row -- confirm
 * against the reference above).  On odd counts the new byte must be 0xa4,
 * otherwise the run is broken and the counter restarts from 0.
 */
#define INCR do {\
    p++;\
    if (p == pend) return INT2FIX(_UNKNOWN);\
    sequence_counter++;\
    if (sequence_counter % 2 == 1 && *p != 0xa4)\
        sequence_counter = 0;\
    if (6 <= sequence_counter)\
        return INT2FIX(_EUC);\
} while (0)

    /* INCR only inspects bytes it steps onto, so seed the run by hand. */
    if (*p == 0xa4)
        sequence_counter = 1;

    while (p < pend) {
        if (*p == '\033') {
            return INT2FIX(_JIS);
        }
        if (*p < '\006' || *p == 0x7f || *p == 0xff) {
            return INT2FIX(_BINARY);
        }
        if (0x81 <= *p && *p <= 0x8d) {
            return INT2FIX(_SJIS);
        }
        if (0x8f <= *p && *p <= 0x9f) {
            return INT2FIX(_SJIS);
        }
        if (*p == 0x8e) {       /* SS2 */
            /* Decide from the byte that follows. */
            INCR;
            if ((0x40 <= *p && *p <= 0x7e) ||
                (0x80 <= *p && *p <= 0xa0) ||
                (0xe0 <= *p && *p <= 0xfc))
                return INT2FIX(_SJIS);
        }
        else if (0xa1 <= *p && *p <= 0xdf) {
            /* Not decisive on its own: look at the following byte(s). */
            INCR;
            if (0xf0 <= *p && *p <= 0xfe)
                return INT2FIX(_EUC);
            if (0xe0 <= *p && *p <= 0xef) {
                /*
                 * Keep scanning while bytes stay >= 0x40.  INCR leaves the
                 * function as soon as p reaches pend, so the p < pend test
                 * is only a guard here.
                 */
                while (p < pend && *p >= 0x40) {
                    if (*p >= 0x81) {
                        if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
                            return INT2FIX(_SJIS);
                        }
                        else if (0xfd <= *p && *p <= 0xfe) {
                            return INT2FIX(_EUC);
                        }
                    }
                    INCR;
                }
            }
            else if (*p <= 0x9f) {
                return INT2FIX(_SJIS);
            }
        }
        else if (0xf0 <= *p && *p <= 0xfe) {
            return INT2FIX(_EUC);
        }
        else if (0xe0 <= *p && *p <= 0xef) {
            /* Decide from the byte that follows. */
            INCR;
            if ((0x40 <= *p && *p <= 0x7e) ||
                (0x80 <= *p && *p <= 0xa0)) {
                return INT2FIX(_SJIS);
            }
            if (0xfd <= *p && *p <= 0xfe) {
                return INT2FIX(_EUC);
            }
        }
        INCR;
    }
    return INT2FIX(_UNKNOWN);
}
/* INCR refers to rb_nkf_guess1's locals; do not let it escape the function. */
#undef INCR