00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 #ifndef COMMA_PARSER_LEXER_HDR_GUARD
00010 #define COMMA_PARSER_LEXER_HDR_GUARD
00011 
#include "comma/basic/Diagnostic.h"
#include "comma/basic/TextProvider.h"
#include <iosfwd>
#include <string>
#include <vector>
00016 
00017 namespace comma {
00018 
00019 class Lexer {
00020 
00021 public:
00022     Lexer(TextProvider &txtProvider, Diagnostic &diag);
00023 
00024     
00025     
00026     
00027     
00028     
00029     
00030     enum Code {
00031         UNUSED_ID,
00032 
00033 #define RESERVED(NAME, STRING) TKN_ ## NAME,
00034 #define GLYPH(NAME, STRING)    TKN_ ## NAME,
00035 #define TOKEN(NAME)            TKN_ ## NAME,
00036 #include "comma/parser/Tokens.def"
00037 #undef RESERVED
00038 #undef GLYPH
00039 #undef TOKEN
00040 
00041         NUMTOKEN_CODES
00042     };
00043 
00044     
00045     
00046     
00047     
00048     
00049     class Token {
00050 
00051     public:
00052         Token() : code(Lexer::UNUSED_ID) { }
00053 
00054         Lexer::Code getCode() const { return code; }
00055 
00056         Location getLocation() const { return location; }
00057 
00058         const char *getRep() const { return string; }
00059 
00060         unsigned getLength() const { return length; }
00061 
00062         
00063         std::string getString() const;
00064 
00065     private:
00066         Lexer::Code code   : 8;
00067         unsigned    length : 24;
00068         Location    location;
00069         const char *string;
00070 
00071         
00072         
00073         friend class Lexer;
00074 
00075         Token(Lexer::Code code,
00076               Location    location,
00077               const char *string,
00078               unsigned length)
00079             : code(code),
00080               length(length),
00081               location(location),
00082               string(string) { }
00083     };
00084 
00085     
00086     
00087     
00088     void scan(Lexer::Token &tkn);
00089 
00090     void peek(Lexer::Token &tkn, unsigned n);
00091 
00092     
00093     
00094     
00095     
00096     
00097     void beginExcursion();
00098 
00099     void endExcursion();
00100 
00101     void forgetExcursion();
00102 
00103     
00104     bool lexSuccessful() const { return errorCount == 0; }
00105 
00106     
00107     unsigned getErrorCount() const { return errorCount; }
00108 
00112     void abortScanning() { scanningAborted = true; }
00113 
00114     
00115     
00116     static bool isFunctionGlyph(const Lexer::Token &tkn) {
00117         switch (tkn.getCode()) {
00118         case TKN_EQUAL:
00119         case TKN_NEQUAL:
00120         case TKN_LESS:
00121         case TKN_LEQ:
00122         case TKN_GREAT:
00123         case TKN_GEQ:
00124         case TKN_MINUS:
00125         case TKN_STAR:
00126         case TKN_PLUS:
00127         case TKN_FSLASH:
00128         case TKN_POW:
00129         case TKN_MOD:
00130         case TKN_REM:
00131         case TKN_AND:
00132         case TKN_NOT:
00133         case TKN_XOR:
00134         case TKN_OR:
00135             return true;
00136         default:
00137             return false;
00138         }
00139     }
00140 
00141     
00142     
00143     static const char *tokenString(Code code);
00144 
00145     
00146     static std::string tokenString(const Token &tkn);
00147 
00148 private:
00149     void scanToken();
00150 
00151     bool eatWhitespace();
00152 
00153     bool eatComment();
00154 
00155     bool scanWord();
00156 
00157     bool scanGlyph();
00158 
00159     bool scanCharacter();
00160 
00161     bool scanString();
00162 
00163     bool scanNumeric();
00164 
00165     bool scanEscape();
00166 
00167     static bool isAlphabetic(unsigned c);
00168 
00169     static bool isInitialIdentifierChar(unsigned c);
00170 
00171     static bool isInnerIdentifierChar(unsigned c);
00172 
00173     static bool isWhitespace(unsigned c);
00174 
00175     static bool isDecimalDigit(unsigned c);
00176 
00177     Location currentLocation() const;
00178 
00179     
00180     
00181     
00182     
00183     
00184     unsigned readStream();
00185     unsigned peekStream();
00186     void ungetStream();
00187     void ignoreStream();
00188 
00189     
00190     
00191     
00192     
00193     Code getTokenCode(TextIterator &start, TextIterator &end) const;
00194 
00195     void emitToken(Code code,
00196                    const TextIterator &start, const TextIterator &end);
00197 
00198     
00199     
00200     void emitToken(Code code, Location loc);
00201 
00202     void emitStringToken(const TextIterator &start, const TextIterator &end);
00203 
00204     void emitIntegerToken(const TextIterator &start, const TextIterator &end);
00205 
00206     void emitIdentifierToken(const TextIterator &start,
00207                              const TextIterator &end);
00208 
00209     void emitCharacterToken(const TextIterator &start, const TextIterator &end);
00210 
00211     DiagnosticStream &report(Location loc, diag::Kind kind) {
00212         ++errorCount;
00213         SourceLocation sloc = txtProvider.getSourceLocation(loc);
00214         return diagnostic.report(sloc, kind);
00215     }
00216 
00217     
00218     
00219     void diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2);
00220 
00221     DiagnosticStream &report(SourceLocation sloc, diag::Kind kind)  {
00222         ++errorCount;
00223         return diagnostic.report(sloc, kind);
00224     }
00225 
00226     DiagnosticStream &report(diag::Kind kind) {
00227         ++errorCount;
00228         SourceLocation sloc = txtProvider.getSourceLocation(currentLocation());
00229         return diagnostic.report(sloc, kind);
00230     }
00231 
00232     
00233     TextProvider &txtProvider;
00234 
00235     
00236     Diagnostic &diagnostic;
00237 
00238     
00239     TextIterator currentIter;
00240 
00241     
00242     unsigned errorCount;
00243 
00244     
00245     bool scanningAborted;
00246 
00247     
00248     
00249     Token *targetToken;
00250 
00251     
00252     std::vector<Token> tokens;
00253 
00254     
00255     
00256     std::vector<unsigned> positionStack;
00257 
00258     
00259     
00260     unsigned index;
00261 };
00262 
00263 } 
00264 
00265 #endif