// QPDFTokenizer.cc -- part of the qpdf source package

#include <qpdf/QPDFTokenizer.hh>

// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.

#include <qpdf/PCRE.hh>
#include <qpdf/QEXC.hh>
#include <qpdf/QTC.hh>

#include <string.h>

// See note above about ctype.
// Return true if ch is one of the ASCII hexadecimal digits 0-9, a-f, A-F.
//
// See note above about ctype: this deliberately avoids isxdigit().
//
// strchr() treats the terminating null of the search string as part of
// the string, so a bare strchr() lookup would report '\0' as a match.
// Reject the null character explicitly so that it is never classified
// as a hex digit.
static bool is_hex_digit(char ch)
{
    return ((ch != '\0') &&
            (strchr("0123456789abcdefABCDEF", ch) != 0));
}

// Build a tokenizer in its initial state.  '#' is treated specially
// inside name tokens until allowPoundAnywhereInName() is called.
QPDFTokenizer::QPDFTokenizer() :
    pound_special_in_name(true)
{
    // All remaining per-token state is initialized by reset().
    this->reset();
}

void
QPDFTokenizer::allowPoundAnywhereInName()
{
    QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name");
    this->pound_special_in_name = false;
}

void
QPDFTokenizer::reset()
{
    state = st_top;
    type = tt_bad;
    val = "";
    raw_val = "";
    error_message = "";
    unread_char = false;
    char_to_unread = '\0';
    string_depth = 0;
    string_ignoring_newline = false;
    last_char_was_bs = false;
}

// Feed one input character to the tokenizer state machine.
//
// Characters are accumulated into val (the interpreted token value) and
// raw_val (the characters as presented).  When a token is complete,
// state becomes st_token_ready and type, val, raw_val and error_message
// describe it.  If the character that ended the token is not part of
// the token, unread_char is set and char_to_unread holds that
// character.
//
// Throws QEXC::Internal if called while a token is already waiting
// (state == st_token_ready), or if the state machine reaches a state it
// does not know how to handle.
void
QPDFTokenizer::presentCharacter(char ch)
{
    // Matches an optionally signed number: either ".digits" or
    // "digits" optionally followed by ".digits".
    static PCRE num_re("^[\\+\\-]?(?:\\.\\d+|\\d+(?:\\.\\d+)?)$");

    if (state == st_token_ready)
    {
      throw QEXC::Internal("QPDF tokenizer presented character "
                       "while token is waiting");
    }

    // Remember the character as presented; it is appended to raw_val
    // at the end of this function.
    char orig_ch = ch;

    // State machine is implemented such that some characters may be
    // handled more than once.  This happens whenever you have to use
    // the character that caused a state change in the new state.

    // handled == false means the character still has to be processed
    // by the second dispatch below (currently only st_in_hexstring).
    bool handled = true;
    if (state == st_top)
    {
      // Note: we specifically do not use ctype here.  It is
      // locale-dependent.
      if (strchr(" \t\n\v\f\r", ch))
      {
          // ignore
      }
      else if (ch == '%')
      {
          // Discard comments
          state = st_in_comment;
      }
      else if (ch == '(')
      {
          // Start of a literal string: reset nesting depth and all
          // escape-handling state.
          string_depth = 1;
          string_ignoring_newline = false;
          memset(bs_num_register, '\0', sizeof(bs_num_register));
          last_char_was_bs = false;
          state = st_in_string;
      }
      else if (ch == '<')
      {
          // Could be "<<" or the start of a hex string; decided by
          // the next character.
          state = st_lt;
      }
      else if (ch == '>')
      {
          // Only valid as the first half of ">>"; decided by the
          // next character.
          state = st_gt;
      }
      else
      {
          val += ch;
          if (ch == ')')
          {
            type = tt_bad;
            QTC::TC("qpdf", "QPDF_Tokenizer bad )");
            error_message = "unexpected )";
            state = st_token_ready;
          }
          else if (ch == '[')
          {
            type = tt_array_open;
            state = st_token_ready;
          }
          else if (ch == ']')
          {
            type = tt_array_close;
            state = st_token_ready;
          }
          else if (ch == '{')
          {
            type = tt_brace_open;
            state = st_token_ready;
          }
          else if (ch == '}')
          {
            type = tt_brace_close;
            state = st_token_ready;
          }
          else
          {
            // Anything else starts a literal (word, name, number,
            // ...) that is classified once it is complete.
            state = st_literal;
          }
      }
    }
    else if (state == st_in_comment)
    {
      // A comment runs to the end of the line.
      if ((ch == '\r') || (ch == '\n'))
      {
          state = st_top;
      }
    }
    else if (state == st_lt)
    {
      if (ch == '<')
      {
          val = "<<";
          type = tt_dict_open;
          state = st_token_ready;
      }
      else
      {
          // Single '<' opens a hex string; this character is its
          // first content character, so let the hexstring handler
          // below process it.
          handled = false;
          state = st_in_hexstring;
      }
    }
    else if (state == st_gt)
    {
      if (ch == '>')
      {
          val = ">>";
          type = tt_dict_close;
          state = st_token_ready;
      }
      else
      {
          // A lone '>' is an error token; the current character is
          // not part of it and is handed back to the caller.
          val = ">";
          type = tt_bad;
          QTC::TC("qpdf", "QPDF_Tokenizer bad >");
          error_message = "unexpected >";
          unread_char = true;
          char_to_unread = ch;
          state = st_token_ready;
      }
    }
    else if (state == st_in_string)
    {
      // Newline skipping after a backslash-newline ends at the first
      // character that is not a newline character.
      if (string_ignoring_newline && (! ((ch == '\r') || (ch == '\n'))))
      {
          string_ignoring_newline = false;
      }

      // bs_num_register is zero-filled whenever it is cleared, so its
      // string length is the number of octal digits collected so far
      // for a pending \ddd escape.
      unsigned int bs_num_count = strlen(bs_num_register);
      bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
      if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal)))
      {
          // We've accumulated \ddd.  PDF Spec says to ignore
          // high-order overflow.
          val += (char) strtol(bs_num_register, 0, 8);
          memset(bs_num_register, '\0', sizeof(bs_num_register));
          bs_num_count = 0;
      }

      if (string_ignoring_newline && ((ch == '\r') || (ch == '\n')))
      {
          // ignore
      }
      else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0)))
      {
          // First or subsequent digit of an octal escape
          bs_num_register[bs_num_count++] = ch;
      }
      else if (last_char_was_bs)
      {
          // Character following a backslash: translate the escape.
          switch (ch)
          {
            case 'n':
            val += '\n';
            break;

            case 'r':
            val += '\r';
            break;

            case 't':
            val += '\t';
            break;

            case 'b':
            val += '\b';
            break;

            case 'f':
            val += '\f';
            break;

            case '\r':
            case '\n':
            // Backslash-newline: drop the newline character(s).
            string_ignoring_newline = true;
            break;

            default:
            // PDF spec says backslash is ignored before anything else
            val += ch;
            break;
          }
      }
      else if (ch == '\\')
      {
          // last_char_was_bs is set/cleared below as appropriate
          if (bs_num_count)
          {
            throw QEXC::Internal("QPDFTokenizer: bs_num_count != 0 "
                             "when ch == '\\'");
          }
      }
      else if (ch == '(')
      {
          // Unescaped nested parenthesis: part of the string value
          val += ch;
          ++string_depth;
      }
      else if ((ch == ')') && (--string_depth == 0))
      {
          // Closing parenthesis of the outermost level ends the
          // string.  (A ')' at a deeper level falls through to the
          // final else and is kept in the value.)
          type = tt_string;
          state = st_token_ready;
      }
      else
      {
          val += ch;
      }

      // A backslash begins an escape only when the previous character
      // was not itself an escaping backslash.
      last_char_was_bs = ((! last_char_was_bs) && (ch == '\\'));
    }
    else if (state == st_literal)
    {
      if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0)
      {
          // A C-locale whitespace character or delimiter terminates
          // token.  It is important to unread the whitespace
          // character even though it is ignored since it may be the
          // newline after a stream keyword.  Removing it here could
          // make the stream-reading code break on some files,
          // though not on any files in the test suite as of this
          // writing.

          // Provisionally a word; reclassified further below.
          type = tt_word;
          unread_char = true;
          char_to_unread = ch;
          state = st_token_ready;
      }
      else
      {
          val += ch;
      }
    }
    else
    {
      // Not one of the states handled above; leave the character for
      // the second dispatch below.
      handled = false;
    }


    if (handled)
    {
      // okay
    }
    else if (state == st_in_hexstring)
    {
      if (ch == '>')
      {
          // End of hex string: convert the collected hex digits in
          // val into the bytes they represent.
          type = tt_string;
          state = st_token_ready;
          if (val.length() % 2)
          {
            // PDF spec says odd hexstrings have implicit
            // trailing 0.
            val += '0';
          }
          char num[3];
          num[2] = '\0';
          std::string nval;
          for (unsigned int i = 0; i < val.length(); i += 2)
          {
            num[0] = val[i];
            num[1] = val[i+1];
            char nch = (char)(strtol(num, 0, 16));
            nval += nch;
          }
          val = nval;
      }
      else if (is_hex_digit(ch))
      {
          val += ch;
      }
      else if (strchr(" \t\n\v\f\r", ch))
      {
          // ignore
      }
      else
      {
          type = tt_bad;
          QTC::TC("qpdf", "QPDF_Tokenizer bad (");
          error_message = std::string("invalid character (") +
            ch + ") in hexstring";
          state = st_token_ready;
      }
    }
    else
    {
      throw QEXC::Internal("invalid state while reading token");
    }

    // A completed literal was provisionally typed as tt_word; work out
    // what kind of token it really is.
    if ((state == st_token_ready) && (type == tt_word))
    {
      if ((val.length() > 0) && (val[0] == '/'))
      {
          type = tt_name;
          // Deal with # in name token.  Note: '/' by itself is a
          // valid name, so don't strip leading /.  That way we
          // don't have to deal with the empty string as a name.
          std::string nval = "/";
          char const* valstr = val.c_str() + 1;
          for (char const* p = valstr; *p; ++p)
          {
            if ((*p == '#') && this->pound_special_in_name)
            {
                // '#' must be followed by exactly two hex digits.
                if (p[1] && p[2] &&
                  is_hex_digit(p[1]) && is_hex_digit(p[2]))
                {
                  char num[3];
                  num[0] = p[1];
                  num[1] = p[2];
                  num[2] = '\0';
                  // NOTE: this local ch shadows the function
                  // parameter of the same name.
                  char ch = (char)(strtol(num, 0, 16));
                  if (ch == '\0')
                  {
                      // Keep the escape in its textual form and
                      // flag the token as bad.
                      type = tt_bad;
                      QTC::TC("qpdf", "QPDF_Tokenizer null in name");
                      error_message =
                        "null character not allowed in name token";
                      nval += "#00";
                  }
                  else
                  {
                      nval += ch;
                  }
                  // Skip the two hex digits just consumed.
                  p += 2;
                }
                else
                {
                  QTC::TC("qpdf", "QPDF_Tokenizer bad name");
                  type = tt_bad;
                  error_message = "invalid name token";
                  nval += *p;
                }
            }
            else
            {
                nval += *p;
            }
          }
          val = nval;
      }
      else if (num_re.match(val.c_str()))
      {
          // Numbers containing a decimal point are reals; all others
          // are integers.
          if (val.find('.') != std::string::npos)
          {
            type = tt_real;
          }
          else
          {
            type = tt_integer;
          }
      }
      else if ((val == "true") || (val == "false"))
      {
          type = tt_bool;
      }
      else if (val == "null")
      {
          type = tt_null;
      }
      else
      {
          // I don't really know what it is, so leave it as tt_word.
          // Lots of cases ($, #, etc.) other than actual words fall
          // into this category, but that's okay at least for now.
          type = tt_word;
      }
    }

    // Record the raw character unless we are still between tokens or
    // the character is the one being handed back as unread.
    if (! (betweenTokens() || ((state == st_token_ready) && unread_char)))
    {
      this->raw_val += orig_ch;
    }
}

// Tell the tokenizer that the input has ended.  Nothing happens if a
// token is already waiting or no token has been started; an unfinished
// comment is simply closed.  In any other state the partial token is
// turned into a ready tt_bad token with an explanatory error message.
void
QPDFTokenizer::presentEOF()
{
    if ((state == st_token_ready) || (state == st_top))
    {
        // okay
        return;
    }

    if (state == st_in_comment)
    {
        state = st_top;
        return;
    }

    // EOF arrived in the middle of a token.
    type = tt_bad;
    error_message = "EOF while reading token";
    state = st_token_ready;
}

// Fetch the completed token, if there is one.
//
// unread_char and ch are always filled in from the tokenizer's current
// unread-character state.  If a token is ready, it is copied into
// token, the tokenizer is reset for the next token, and true is
// returned.  Otherwise token is left untouched and false is returned.
bool
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
{
    // Report the unread character before any reset() can clear it.
    unread_char = this->unread_char;
    ch = this->char_to_unread;

    if (this->state != st_token_ready)
    {
        return false;
    }

    token = Token(type, val, raw_val, error_message);
    reset();
    return true;
}

// Return true when no token is in progress: the tokenizer is either at
// the top level or skipping over a comment.
bool
QPDFTokenizer::betweenTokens()
{
    switch (state)
    {
      case st_top:
      case st_in_comment:
        return true;

      default:
        return false;
    }
}

// Generated by Doxygen 1.6.0