/*--- Filename: "utf_eight.hpp" ---

  --- Projectname: Encoder/Decoder (UTF-8 Unicode4.0) FILE Functions ---
  (Targetsystem: Crossplatform)
  
  Author: Dennis Busch (http://www.dennisbusch.de)

  Content:
  Functions for storing and restoring full lines of
  UTF-8(as described in "Unicode4.0.0 Chapter 3.9") encoded UNICODE
  'string's , 'wstring's and 'uistring's to and from 'FILE's.

  Details:
    Unicode characters that cannot be held in a single
    'memory cell' of the desired target string type will be read but ignored.

    Footnote1:
    "The definition of UTF-8 prohibits encoding character numbers between
     U+D800 and U+DFFF"
    So characters decoded into that range will be ignored as well and, more
    importantly, characters from that range will not be encoded on storing.

    Footnote2:
    On storing, the ByteOrderMark(BOM) U+FEFF
    will be stored like any other character encoded in UTF-8,
    always leading to the octet sequence EF BB BF.
    On restoring, any sequence decoded to U+FEFF will be ignored, that is,
    it is not appended to the resulting string.

    Footnote3:
    The standard allows only Unicode code points of at most 21
    bits to be encoded or decoded. Even though characters
    with the full range of 32 bits *could* be stored in UTF-8
    using up to 6 octets, this implementation does *NOT* do so.
    It only encodes and decodes sequences of up to 4 octets, as defined by
    said standard ("Unicode4.0.0 Chapter 3.9" UTF-8).
    Quote: "Unicode4.0.0 Appendix C.3"
      "those five- and six-byte sequences are illegal for the use 
       of UTF-8 as an encoding form of Unicode characters."

    Footnote4:
    This implementation also tries to protect against decoding of invalid
    octet sequences.
    It does so by silently reading any invalid sequence; every such
    invalid sequence (as it should not be allowed in the first place) will
    be skipped and an ASCII '?' is produced as the result character.
*/

#include <cstdio>
#include <cstdlib>
#include <string>
using namespace std;

#if !defined(__DB_utf_eight_HEAD_INCLUDED)
#define __DB_utf_eight_HEAD_INCLUDED

/* - All functions return a negative value on failure or if the result
     would be empty.
   - The FILE parameter is always expected to be a valid 
     file pointer and it must be opened in binary mode. */


/* Writes the UTF-8 signature (the byte order mark U+FEFF, which the file
   header describes as the octet sequence EF BB BF) to 'out', so that
   external editors can recognise the file as UTF-8.
   Meant to be called only once, at the very beginning of a file.
     out     - file to write to (valid pointer, opened in binary mode)
     returns - a negative value on failure */
int write_bom_utf8(std::FILE* out);

// Single-character UTF-8 transfer to and from a file.
/* encode_utf8: stores the Unicode character 'code' in 'out' as UTF-8.
     According to the file header, characters in U+D800..U+DFFF are not
     encoded and at most 4 octets are produced per character.
   decode_utf8: restores one UTF-8 encoded character from 'in' and hands
     it back through 'result'.
     According to the file header, invalid octet sequences are skipped and
     reported as an ASCII '?'.
   Both return a negative value on failure. */
int encode_utf8(std::FILE* out, unsigned int code);
int decode_utf8(std::FILE* in, unsigned int* result);

// Restoring one line of text from a file into a string or wstring
/* Procedure:
   1. Any preceding newline codes, carriage return codes and byte order
      marks are skipped.
   2. Characters are then collected until the next newline or carriage
      return shows up; that terminating nl/cr is not appended to the result.
   Consequences: empty lines are skipped, and a Windows style CR+NL pair
   does not produce an additional empty line.
   Result handling: '*result' is replaced by the line that was read when
   the return value is 0; for any other return value '*result' is left
   completely untouched. */
int readline_utf8(std::FILE* in, std::string* result);
int readline_utf8(std::FILE* in, std::wstring* result);

// Storing a string or wstring as one line in a file
/* The contents of 'line' are written to 'out', followed by CR+NL.
   Remark on text mode: when 'out' was not opened in binary mode, some
   platforms (Windows for example) translate NL into CR+NL themselves, so
   the file then ends up containing CR+CR+NL. The readline functions cope
   with that; the remark is only meant to spare you confusion should you
   ever inspect the output file closely. */
int writeline_utf8(std::FILE* out, std::string line);
int writeline_utf8(std::FILE* out, std::wstring line);

// Line transfer for strings able to hold the full range of characters
/* 'uistring' is a string whose elements are of type 'unsigned int'.
   The two overloads below are the 'uistring' counterparts of the
   readline_utf8/writeline_utf8 declarations for string and wstring.
   NOTE(review): the C++ standard only specifies std::char_traits for the
   built-in character types, so basic_string<unsigned int> depends on the
   library supplying a generic char_traits - verify this on every target
   toolchain. */
typedef std::basic_string<unsigned int> uistring;
int readline_utf8(std::FILE* in, uistring* result);
int writeline_utf8(std::FILE* out, uistring line);

#endif // #if !defined(__DB_utf_eight_HEAD_INCLUDED)

/*
  To preserve the possibility of making nicely formatted printouts
  (Format: "Portrait"), the code should be limited to a width of 78 chars.
123456789012345678901234567890123456789012345678901234567890123456789012345678
---------10--------20--------30--------40--------50--------60--------70-----78
*/
