/*--- Filename: "utf_eight.cpp" ---

  --- Projectname: Encoder/Decoder (UTF-8 Unicode4.0) FILE Functions ---
  (Targetsystem: Crossplatform)
  
  Author: Dennis Busch (http://www.dennisbusch.de)

  Content:
  Functions for storing and restoring full lines of
  UTF-8(as described in "Unicode4.0.0 Chapter 3.9") encoded UNICODE
  'string's , 'wstring's and 'uistring's to and from 'FILE's.

  Details:
    Unicode characters that can not be held in a single
    'memory cell' of the desired target string type will be read but ignored.

    Footnote1:
    "The definition of UTF-8 prohibits encoding character numbers between
     U+D800 and U+DFFF"
    So a sequence that decodes into that range is treated as invalid (an
    ASCII '?' is handed out instead, see Footnote4) and, more importantly,
    characters from that range are never encoded on storing (the encoding
    function reports an error instead).

    Footnote2:
    On storing, the ByteOrderMark(BOM) U+FEFF
    will be stored like any other character encoded in UTF-8,
    always leading to the octet sequence EF BB BF.
    On restoring, a sequence decoded to U+FEFF is never appended to the
    resulting string: before any character of a line has been read it is
    skipped, and once characters have been read it ends the line, exactly
    like a newline or carriage return does.

    Footnote3:
    The standard allows only unicodes of a maximum of 21
    bits to be encoded or decoded. Even though characters
    with the full range of 32bits *could* be stored in UTF-8 
    using up to 6 octets, this implementation does *NOT* do so.
    It only correctly encodes, decodes up to 4 octets, as defined by said
    standard("Unicode4.0.0 Chapter 3.9" UTF-8).
    Quote: "Unicode4.0.0 Appendix C.3"
      "those five- and six-byte sequences are illegal for the use 
       of UTF-8 as an encoding form of Unicode characters."

    Footnote4:
    This implementation also tries to protect against the decoding of
    invalid octet sequences.
    It does so by silently reading any invalid sequence; every such
    invalid sequence (which should not have been allowed in the first
    place) is skipped and an ASCII '?' is handed out as the result
    character.
*/

#include "utf_eight.hpp"
#include <iostream>
#include <limits>

#if !defined(__DB_utf_eight_BODY_INCLUDED)
#define __DB_utf_eight_BODY_INCLUDED

// Some Unicode character constants
// First and last character number (both inclusive) of the range that must
// never appear in UTF-8, see Footnote1 in the file header.
#define prohibited_start 0xD800
#define prohibited_end 0xDFFF
// characters from the above range are not allowed to be stored as UTF-8

// codes using more than 21 bits are also not allowed to be stored as UTF-8
// (because the standard definition does not describe anything beyond 21 bits)
// This is the smallest value that needs more than 21 bits.
#define twentyone_bits 0x00200000
/* (10-0000-0000-0000-0000-0000)bin is (200000)hex is (2^21)dec */

// Character numbers that get special treatment by the line functions
#define BOM 0xFEFF // byte order mark
#define NL 0x0A // newline(ASCII)
#define CR 0x0D // carriage return(ASCII)

// Some UTF-8 encoding/decoding constants
// The octets_* values are the fixed most significant bits of an octet; on
// decoding the first octet is compared against them (largest first) to find
// out how many octets the sequence consists of.
#define octets_one 0x0   // currently not used by the code in this file
#define octets_invalid_single 0x80 /* single sequence octets >= than this are
                                    not ASCII and thus invalid on decoding */
#define octets_two 0xC0   // (12*16)dec == (110xxxxx)bin
#define octets_three 0xE0 // (14*16)dec == (1110xxxx)bin
#define octets_four 0xF0  // (15*16)dec == (11110xxx)bin
#define octets_fup 0x80   //  (8*16)dec == (10xxxxxx)bin (follow up octet)

// Largest character number that is stored using one, two, three or four
// octets (the encoder picks the first range the character number fits in).
#define one_range_max 0x0000007F 
#define two_range_max 0x000007FF
#define three_range_max 0x0000FFFF
#define four_range_max 0x0010FFFF

// Some "how many lowest bits are set" constants for UTF-8 encoding/decoding
// (used as bit masks to cut the payload bits out of a code or an octet)
#define three_set 0x07
#define four_set 0x0F
#define five_set 0x1F
#define six_set 0x3F
#define seven_set 0x7F

// All invalid octet values
// (0xC0, 0xC1 and everything from 0xF5 to 0xFF; these are the values the
//  helper function invalid_octet() tests for)
#define invalid_C0 0xC0
#define invalid_C1 0xC1
#define invalid_oct_start 0xF5
#define invalid_oct_end 0xFF

// Explicitly Writing the UTF-8 ByteOrderMark(use only once! at begin of file)
/* Hands the byte order mark U+FEFF to encode_utf8 and passes its return
   value (0 on success, -1 on failure) straight back to the caller. */
int write_bom_utf8(FILE *out)
{
  const unsigned int bom_code = BOM;

  return encode_utf8(out, bom_code);
}

// Helper function to find invalid octet value
/* Returns true when 'which' is one of the octet values that are never
   accepted by this implementation: 0xC0, 0xC1 or anything from 0xF5 up to
   0xFF. Every other value yields false. */
bool invalid_octet(int which)
{
  const bool is_c0_or_c1 = (which==invalid_C0)||(which==invalid_C1);
  const bool is_f5_to_ff =
    (which>=invalid_oct_start)&&(which<=invalid_oct_end);

  return is_c0_or_c1 || is_f5_to_ff;
}
// Helper function to detect invalid octet value in all four octets
bool any_invalid_octet(unsigned char which[4])
{
  int i;
  // Generally invalid octet values?
  for(i=0; i<4; i++)
    if(invalid_octet(which[i]))
      return true;

  // Invalid follow ups (after so called 'lonely start characters')?
  for(i=1; i<4; i++)
    if(which[i] < octets_fup)
      return true;

  return false;
}

// Encoding a single UNICODE character to file
/* Writes 'code' to 'out' as a UTF-8 sequence of one to four octets.

   Parameters:
     out  - file to write to
     code - Unicode character number to encode

   Returns:
      0 - the complete octet sequence was written
     -1 - 'out' is NULL, 'code' lies in the prohibited range
          U+D800..U+DFFF, 'code' is larger than U+10FFFF, or fwrite did
          not write the complete sequence

   Nothing is written when 'code' is rejected. */
int encode_utf8(FILE *out, unsigned int code)
{
  if(out==NULL)
    return -1; // file error

  if((code>=prohibited_start)&&(code<=prohibited_end))
    return -1; // code not allowed by UTF-8 standard

  /* Everything above U+10FFFF has no valid four octet form. Checking
     against four_range_max (instead of only rejecting codes that use more
     than 21 bits) keeps the codes from 0x110000 to 0x1FFFFF from being
     reported as stored although no octet is written for them. */
  if(code>four_range_max)
    return -1; // code outside of the range that UTF-8 can store

  unsigned char octet[4] = { 0, octets_fup, octets_fup, octets_fup };
  /*(the utf-8 standard allows only a maximum of four octets to be stored)*/
  unsigned int used_octets = 0;

  // Prepare the octets
  if(code <= one_range_max)
  {
    used_octets = 1;
    // The lowest 7 bits of the code are the whole sequence
    octet[0] = static_cast<unsigned char>(code & seven_set);
  }
  else if(code <= two_range_max)
  {
    used_octets = 2;
    // First octet: (110xxxxx)bin with the 5 most significant bits
    octet[0] = static_cast<unsigned char>(
      octets_two | ((code >> 6) & five_set));
    // Second octet: (10xxxxxx)bin with the 6 least significant bits
    octet[1] = static_cast<unsigned char>(octets_fup | (code & six_set));
  }
  else if(code <= three_range_max)
  {
    used_octets = 3;
    // First octet: (1110xxxx)bin with the 4 most significant bits
    octet[0] = static_cast<unsigned char>(
      octets_three | ((code >> 12) & four_set));
    // Second octet: the next 6 bits
    octet[1] = static_cast<unsigned char>(
      octets_fup | ((code >> 6) & six_set));
    // Third octet: the 6 least significant bits
    octet[2] = static_cast<unsigned char>(octets_fup | (code & six_set));
  }
  else // code <= four_range_max, everything larger was rejected above
  {
    used_octets = 4;
    // First octet: (11110xxx)bin with the 3 most significant bits
    octet[0] = static_cast<unsigned char>(
      octets_four | ((code >> 18) & three_set));
    // Second octet: the next 6 bits
    octet[1] = static_cast<unsigned char>(
      octets_fup | ((code >> 12) & six_set));
    // Third octet: the next 6 bits
    octet[2] = static_cast<unsigned char>(
      octets_fup | ((code >> 6) & six_set));
    // Fourth octet: the 6 least significant bits
    octet[3] = static_cast<unsigned char>(octets_fup | (code & six_set));
  }

  // Now write these octets to the file
  if(fwrite(octet,sizeof(unsigned char),used_octets,out) != used_octets)
    return -1; // error during fwrite

  return 0;
}

// Decoding a single UNICODE character from file
// (invalid octet sequences will be skipped and handed out as ASCII '?')
int decode_utf8(FILE *in, unsigned int *result)
{
  if((in==NULL)||(result==NULL))
    return -1; // file error or invalid result address

  unsigned char octet[4] = { 0, octets_fup , octets_fup, octets_fup };
  /*(the utf-8 standard allows only a maximum of four octets to be restored)*/
  unsigned char octets_used = 0;
  unsigned int decoded = 0;
  bool overlong_sequence = false;

  // Read in the first octet
  if(fread(&octet[0],sizeof(unsigned char),1,in)!=1)
    return -1; // file error

// Protect against decoding invalid octet sequences(stage 1 of 3)
  if(invalid_octet(octet[0]))// invalid first octet
  {
    *result = 63; // '?'
    return 0;
  }

  // Read additional octets
  if(octet[0] >= octets_four) // three more octets expected (4 total)
  {
    if(fread(&octet[1],sizeof(unsigned char),3,in)!=3)
      return -1; // file error
    octets_used = 4;

    /* Copy in the 3 least significant bits from first octet and shift
       them into correct position */
    decoded += (octet[0] & three_set); 
    decoded = decoded << 6;
    /* Copy in the 6 least significant bits from remaining octets and shift
       them into correct position */
    decoded += (octet[1] & six_set); 
    decoded = decoded << 6;
    decoded += (octet[2] & six_set); 
    decoded = decoded << 6;
    decoded += octet[3] & six_set;

    if(decoded <= three_range_max)
      overlong_sequence = true;
  }
  else if (octet[0] >= octets_three) // two more octets expected (3 total)
  {
    if(fread(&octet[1],sizeof(unsigned char),2,in)!=2)
      return -1; // file error
    octets_used = 3;

    /* Copy in the 4 least significant bits from first octet and shift
       them into correct position */
    decoded += (octet[0] & four_set); 
    decoded = decoded << 6;
    /* Copy in the 6 least significant bits from remaining octets and shift
       them into correct position */
    decoded += (octet[1] & six_set); 
    decoded = decoded << 6;
    decoded += octet[2] & six_set;
    
    if(decoded <= two_range_max)
      overlong_sequence = true;
  }
  else if (octet[0] >= octets_two) // one more octet expected (2 total)
  {
    if(fread(&octet[1],sizeof(unsigned char),1,in)!=1)
      return -1; // file error
    octets_used = 2;

    /* Copy in the 5 least significant bits from first octet and shift
       them into correct position */
    decoded += (octet[0] & five_set); 
    decoded = decoded << 6;
    // Copy in the 6 least significant bits from second octet
    decoded += octet[1] & six_set;

    if(decoded <= one_range_max)
      overlong_sequence = true;
  }
  else if(octet[0] >= octets_invalid_single)
// Protect against decoding invalid octet sequences(stage 2 of 3)
  {
    /* a single octet sequence was expected but its' value is non ASCII(>127)
       and by that it is invalid to decode and use it */
    *result = 63; // '?'
    return 0;
  }
  else // first octet is valid single ASCII character
  {
    octets_used = 1;
    decoded += octet[0];
  }

// Protect against decoding invalid octet sequences(stage 3 of 3)
  if(any_invalid_octet(octet)) // other invalid octet found on decoding!
  {
    *result = 63; // '?'
    return 0;
  }
  if(overlong_sequence) // decoded character was not encoded correctly!
  {
    *result = 63; // '?'
    return 0;
  }
  if((octets_used > 1)&&(decoded == 0))// "hidden" null character decoded!
  {
    *result = 63; // '?'
    return 0;
  }
  if((decoded>=prohibited_start)&&(decoded<=prohibited_end))
  {  /* found a decoded code that should not have been allowed 
        to be encoded as UTF-8 in the first place! */
    *result = 63; // '?'
    return 0;
  }

/* If the execution reaches this line, 'decoded' should hold a Unicode
   character number(max 21 bits wide),
   decoded from a valid octet sequence, that was also valid
   to be encoded as UTF-8. */

  // Hand over the decoded character
  *result = decoded;

  return 0;
}

// Reading string and wstring from file 
/* (until the first newline appears, 
    the newline character itself is not appended) */
int readline_utf8(FILE *in, string *result)
{
  if((in==NULL)||(result==NULL))
    return -1; // file error or invalid result address

  string dec_line;
  unsigned int dec_code = 0;
  char to_put[2] = { 0, 0 };

  while(decode_utf8(in,&dec_code) == 0)
  {
    if((dec_code == NL)||(dec_code == CR)||(dec_code == BOM))
      // newline or carriage return or byte order mark found
      if(!dec_line.empty()) // already read something, so stop
        break;
      else; /* not read something yet, so ignore CR, NL, BOM
               (here to inteprete the typical windows CR+NL combination to be
                seen as just a single end of line mark, so basically all
                newlines or carriage returns (empty lines) before any actual
                character data will be ignored) */
    else if(dec_code <= one_range_max) // valid ASCII found?
    { 
      to_put[0] = dec_code; // possible loss on conversion is intended here
      to_put[1] = '\0';
      dec_line.append(to_put);
    } // everything else will be ignored, because it does not fit
  }
  
  if(dec_line.empty()) // did not read anything at all?
    return -1; // report error
  else // hand over the read line
    result->assign(dec_line);

  return 0;
}

int readline_utf8(FILE *in, wstring *result)
{
  if((in==NULL)||(result==NULL))
    return -1; // file error or invalid result address

  wstring dec_line;
  unsigned int dec_code = 0;
  wstring::value_type to_put[2] = { 0, 0 };

  static bool max_wc_val_inited = false;
  static wstring::value_type max_wc_val;
  // Fully fill all bits of max_wc_val to find out the actual max value
  if (!max_wc_val_inited)
  {
    unsigned int i;
    wstring::value_type fill_8 = 0xFF;
    max_wc_val = 0;
    for(i=0; i<sizeof(wstring::value_type); i++)
      max_wc_val = (max_wc_val | fill_8) << 8;

    max_wc_val = max_wc_val | fill_8 ;
    max_wc_val_inited = true;
  }

  while(decode_utf8(in,&dec_code) == 0)
  {
    if((dec_code == NL)||(dec_code == CR)||(dec_code == BOM))
      // newline or carriage return or byte order mark found
      if(!dec_line.empty()) // already read something, so stop
        break;
      else; /* not read something yet, so ignore CR, NL, BOM
               (here to inteprete the typical windows CR+NL combination to be
                seen as just a single end of line mark, so basically all
                newlines or carriage returns (empty lines) before any actual
                character data will be ignored) */
    else if(dec_code <= max_wc_val) // valid wide-character found?
    { 
      to_put[0] = dec_code; // possible loss on conversion is intended here
      to_put[1] = L'\0';
      dec_line.append(to_put);
    } // everything else will be ignored, because it does not fit
  }
  
  if(dec_line.empty()) // did not read anything at all?
    return -1; // report error
  else // hand over the read line
    result->assign(dec_line);

  return 0;
}

// Writing string and wstring to file (also writes a newline after the string)
int writeline_utf8(FILE *out, string line)
{
  if(out==NULL)
    return -1; // file error
 
  if(line.empty())
    return -1; // nothing to store

  int i=0;
  int len=line.size();
  unsigned int to_write = 0;

  // Encode and write all characters of line
  for(i=0; i<len; i++)
  {
    to_write = line.at(i);
    if(encode_utf8(out, to_write) < 0)
      return -1; // file error
  }
  // Finally encode and write a carriage return and a newline character
  if(encode_utf8(out, CR) < 0)
    return -1; // file error
  if(encode_utf8(out, NL) < 0)
    return -1; // file error

  return 0;
}

int writeline_utf8(FILE *out, wstring line)
{
  if(out==NULL)
    return -1; // file error

  if(line.empty())
    return -1; // nothing to store

  int i=0;
  int len=line.size();
  unsigned int to_write = 0;

  // Encode and write all characters of line
  for(i=0; i<len; i++)
  {
    to_write = line.at(i);
    if(encode_utf8(out, to_write) < 0)
      return -1; // file error
  }
  // Finally encode and write a carriage return and a newline character
  if(encode_utf8(out, CR) < 0)
    return -1; // file error
  if(encode_utf8(out, NL) < 0)
    return -1; // file error 

  return 0;
}

// Functions for strings that can store the full range of characters
int writeline_utf8(FILE *out, uistring line)
{
  if(out==NULL)
    return -1; // file error

  if(line.empty())
    return -1; // nothing to store

  int i=0;
  int len=line.size();
  unsigned int to_write = 0;

  // Encode and write all characters of line
  for(i=0; i<len; i++)
  {
    to_write = line.at(i);
    if(encode_utf8(out, to_write) < 0)
      return -1; // file error
  }
  // Finally encode and write a carriage return and a newline character
  if(encode_utf8(out, CR) < 0)
    return -1; // file error
  if(encode_utf8(out, NL) < 0)
    return -1; // file error 

  return 0;
}

int readline_utf8(FILE *in, uistring *result)
{
  if((in==NULL)||(result==NULL))
    return -1; // file error or invalid result address

  uistring dec_line;
  unsigned int dec_code = 0;
  uistring::value_type to_put[2] = { 0, 0 };

  while(decode_utf8(in,&dec_code) == 0)
  {
    if((dec_code == NL)||(dec_code == CR)||(dec_code == BOM))
      // newline or carriage return or byte order mark found
      if(!dec_line.empty()) // already read something, so stop
        break;
      else; /* not read something yet, so ignore CR, NL, BOM
               (here to inteprete the typical windows CR+NL combination to be
                seen as just a single end of line mark, so basically all
                newlines or carriage returns (empty lines) before any actual
                character data will be ignored) */
    else // since uistring has same type as dec_code, it always fits
    { 
      to_put[0] = dec_code;
      to_put[1] = 0;
      dec_line.append(to_put);
    } // everything else will be ignored, because it does not fit
  }
  
  if(dec_line.empty()) // did not read anything at all?
    return -1; // report error
  else // hand over the read line
    result->assign(dec_line);

  return 0;
}

#endif // #if !defined(__DB_utf_eight_BODY_INCLUDED)

/*
  Preserving the possibilty to make nicely formatted printouts
  (Format: "Portrait"), the code should be normed to a width of 78 chars.
123456789012345678901234567890123456789012345678901234567890123456789012345678
---------10--------20--------30--------40--------50--------60--------70-----78
*/
