You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
234 lines
6.9 KiB
C++
234 lines
6.9 KiB
C++
// Copyright (C) 2007 Davis E. King (davis@dlib.net), and Nils Labugt
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#undef DLIB_UNICODe_ABSTRACT_H_
|
|
#ifdef DLIB_UNICODe_ABSTRACT_H_
|
|
|
|
#include "../uintn.h"
|
|
#include "../error.h"
|
|
#include <string>
|
|
#include <fstream>
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// a typedef for an unsigned 32bit integer to hold our UNICODE characters
|
|
typedef uint32 unichar;
|
|
|
|
// a typedef for a string object to hold our UNICODE strings
|
|
typedef std::basic_string<unichar> ustring;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <typename T>
|
|
bool is_combining_char(
|
|
const T ch_
|
|
);
|
|
/*!
|
|
ensures
|
|
- if (ch_ is a unicode combining character) then
|
|
- returns true
|
|
- else
|
|
- returns false
|
|
!*/
|
|
|
|
bool is_surrogate(
|
|
unichar ch
|
|
);
|
|
/*!
|
|
ensures
|
|
- if (ch is a unicode surrogate character) then
|
|
- returns true
|
|
- else
|
|
- returns false
|
|
!*/
|
|
|
|
unichar surrogate_pair_to_unichar(
|
|
unichar first,
|
|
unichar second
|
|
);
|
|
/*!
|
|
requires
|
|
- 0xD800 <= first < 0xDC00
|
|
- 0xDC00 <= second < 0xE000
|
|
- is_surrogate(first) == true
|
|
- is_surrogate(second) == true
|
|
ensures
|
|
- returns the single unicode character encoded by the surrogate pair (first, second)
|
|
!*/
|
|
|
|
void unichar_to_surrogate_pair(
|
|
unichar ch,
|
|
unichar& first,
|
|
unichar& second
|
|
);
|
|
/*!
|
|
requires
|
|
- ch >= 0x10000 (i.e. is not in Basic Multilingual Plane)
|
|
ensures
|
|
- surrogate_pair_to_unichar(#first,#second) == ch
|
|
(i.e. converts ch into two surrogate characters)
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Exception type thrown by convert_utf8_to_utf32() when it is given a string
// that is not valid UTF-8.
class invalid_utf8_error : public error
|
|
{
|
|
public:
|
|
// Default constructor.  Passes EUTF8_TO_UTF32 to the error base class
// constructor (presumably the error type code for this failure; the
// definition of error lives in ../error.h and is not visible here).
invalid_utf8_error():error(EUTF8_TO_UTF32) {}
|
|
};
|
|
|
|
const ustring convert_utf8_to_utf32 (
|
|
const std::string& str
|
|
);
|
|
/*!
|
|
ensures
|
|
- if (str is a valid UTF-8 encoded string) then
|
|
- returns a copy of str that has been converted into a
|
|
unichar string
|
|
- else
|
|
- throws invalid_utf8_error
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
const ustring convert_wstring_to_utf32 (
|
|
const std::wstring &wstr
|
|
);
|
|
/*!
|
|
requires
|
|
- wstr is a valid UTF-16 string when sizeof(wchar_t) == 2
|
|
- wstr is a valid UTF-32 string when sizeof(wchar_t) == 4
|
|
ensures
|
|
- returns a copy of wstr that has been converted into a UTF-32 string
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
const std::wstring convert_utf32_to_wstring (
|
|
const ustring &str
|
|
);
|
|
/*!
|
|
requires
|
|
- str is a valid UTF-32 encoded string
|
|
ensures
|
|
- converts str into wstring whose encoding is UTF-16 when sizeof(wchar_t) == 2
|
|
- converts str into wstring whose encoding is UTF-32 when sizeof(wchar_t) == 4
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
const std::wstring convert_mbstring_to_wstring (
|
|
const std::string &str
|
|
);
|
|
/*!
|
|
requires
|
|
- str is a valid multibyte string whose encoding is the same as the current locale setting
|
|
ensures
|
|
- converts str into wstring whose encoding is UTF-16 when sizeof(wchar_t) == 2
|
|
- converts str into wstring whose encoding is UTF-32 when sizeof(wchar_t) == 4
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
const std::string convert_wstring_to_mbstring (
|
|
const std::wstring &src
|
|
);
|
|
/*!
|
|
requires
|
|
- src is a valid wide character string whose encoding is the same as the current
|
|
locale setting
|
|
ensures
|
|
- returns a multibyte encoded version of the given string
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename charT
|
|
>
|
|
class basic_utf8_ifstream : public std::basic_istream<charT>
|
|
{
|
|
/*!
|
|
WHAT THIS OBJECT REPRESENTS
|
|
This object represents an input file stream much like the
|
|
normal std::ifstream except that it knows how to read UTF-8
|
|
data. So when you read characters out of this stream it will
|
|
automatically convert them from the UTF-8 multibyte encoding
|
|
into a fixed width wide character encoding.
|
|
!*/
|
|
|
|
public:
|
|
|
|
basic_utf8_ifstream (
|
|
);
|
|
/*!
|
|
ensures
|
|
- constructs an input stream that isn't yet associated with
|
|
a file.
|
|
!*/
|
|
|
|
basic_utf8_ifstream (
|
|
const char* file_name,
|
|
std::ios_base::openmode mode = std::ios::in
|
|
);
|
|
/*!
|
|
ensures
|
|
- tries to open the given file for reading by this stream
|
|
- mode is interpreted exactly the same way as the open mode
|
|
argument used by std::ifstream.
|
|
!*/
|
|
|
|
basic_utf8_ifstream (
|
|
const std::string& file_name,
|
|
std::ios_base::openmode mode = std::ios::in
|
|
);
|
|
/*!
|
|
ensures
|
|
- tries to open the given file for reading by this stream
|
|
- mode is interpreted exactly the same way as the open mode
|
|
argument used by std::ifstream.
|
|
!*/
|
|
|
|
void open(
|
|
const std::string& file_name,
|
|
std::ios_base::openmode mode = std::ios::in
|
|
);
|
|
/*!
|
|
ensures
|
|
- tries to open the given file for reading by this stream
|
|
- mode is interpreted exactly the same way as the open mode
|
|
argument used by std::ifstream.
|
|
!*/
|
|
|
|
void open (
|
|
const char* file_name,
|
|
std::ios_base::openmode mode = std::ios::in
|
|
);
|
|
/*!
|
|
ensures
|
|
- tries to open the given file for reading by this stream
|
|
- mode is interpreted exactly the same way as the open mode
|
|
argument used by std::ifstream.
|
|
!*/
|
|
|
|
void close (
|
|
);
|
|
/*!
|
|
ensures
|
|
- any file opened by this stream has been closed
|
|
!*/
|
|
};
|
|
|
|
typedef basic_utf8_ifstream<unichar> utf8_uifstream;
|
|
typedef basic_utf8_ifstream<wchar_t> utf8_wifstream;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_UNICODe_ABSTRACT_H_
|
|
|
|
|