View Javadoc
1   package org.csveed.token;
2   
3   import static org.csveed.token.ParseState.*;
4   
5   import org.csveed.common.Column;
6   import org.slf4j.Logger;
7   import org.slf4j.LoggerFactory;
8   
9   /**
10  * Yep, a state machine. Managing all kinds of booleans to form a pseudo-state doesn't work really well
11  * whereas a state machine does. The state machine takes one character at a time, checks routes to the new
12  * state if necessary and holds tokens, which it returns whenever a field-end ('popToken') has been found.
13  * @author Robert Bor
14  */
15  public class ParseStateMachine {
16  
17      private static final Logger LOG = LoggerFactory.getLogger(ParseStateMachine.class);
18  
19      private ParseState state = START_OF_LINE;
20  
21      private StringBuilder token = new StringBuilder();
22  
23      private int charactersRead;
24  
25      private SymbolMappingping">SymbolMapping symbolMapping = new SymbolMapping();
26  
27      private TokenState tokenState = TokenState.RESET;
28  
29      private boolean trim = true;
30  
31      private boolean trash;
32  
33      private ColumnColumn currentColumn = new Column();
34  
35      private int currentLine = 1;
36  
37      private int newLine = currentLine;
38  
39      public int getCurrentLine() {
40          return this.currentLine;
41      }
42  
43      public int getCurrentColumn() {
44          return this.currentColumn.getColumnIndex();
45      }
46  
47      public boolean isTrash() {
48          return this.trash;
49      }
50  
51      public String offerSymbol(int symbolCharacter) throws ParseException {
52  
53          this.trash = false;
54  
55          EncounteredSymbol symbol = symbolMapping.find(symbolCharacter, state);
56  
57          if (symbol.isTrash()) {
58              this.trash = true;
59              return null;
60          }
61  
62          if (isFinished()) {
63              throw new ParseException(state, symbolCharacter, symbol);
64          }
65  
66          if (currentLine != newLine) {
67              state = START_OF_LINE;
68              charactersRead = 0;
69              currentColumn = currentColumn.nextLine();
70              currentLine = newLine;
71          }
72  
73          if (currentLine < symbolMapping.getStartLine()) {
74              state = SKIP_LINE;
75          }
76  
77          if (tokenState.isStart()) {
78              tokenState = tokenState.next();
79          }
80  
81          ParseState newState = determineState(symbolCharacter, symbol);
82          LOG.debug("{} ({}): {} => {}", (char)symbolCharacter, symbol, state, newState);
83  
84          if (newState.isTokenize()) {
85              if (tokenState.isReset()) {
86                  trim = newState.trim();
87                  tokenState = tokenState.next();
88              }
89              token.append((char)symbolCharacter);
90          }
91          String returnToken = null;
92  
93          if (newState.isPopToken()) {
94              returnToken = token.toString();
95              if (trim) {
96                  returnToken = returnToken.trim();
97              }
98              token = new StringBuilder();
99              tokenState = tokenState.next();
100             currentColumn = currentColumn.nextColumn();
101         }
102 
103         if (newState.isLineFinished()) {
104             newLine++;
105         } else {
106             charactersRead++;
107         }
108 
109         state = newState;
110 
111         return returnToken;
112     }
113 
114     public boolean isTokenStart() {
115         return tokenState.isStart();
116     }
117 
118     public boolean isLineFinished() {
119         return state.isLineFinished();
120     }
121 
122     public boolean isFinished() {
123         return state == FINISHED;
124     }
125 
126     public boolean ignoreLine() {
127         return state.isIgnore() || isEmptyLine();
128     }
129 
130     public boolean isEmptyLine() {
131         return charactersRead == 0;
132     }
133 
134     protected ParseState determineState(int symbolCharacter, EncounteredSymbol symbol) throws ParseException {
135 
136         switch (state) {
137             case SKIP_LINE:
138                 switch(symbol) {
139                     case EOL_SYMBOL:
140                         return SKIP_LINE_FINISHED;
141                     case END_OF_FILE_SYMBOL:
142                         return FINISHED;
143                     default:
144                         return SKIP_LINE;
145                 }
146             case COMMENT_LINE:
147                 switch(symbol) {
148                     case EOL_SYMBOL:
149                         return COMMENT_LINE_FINISHED;
150                     case END_OF_FILE_SYMBOL:
151                         return FINISHED;
152                     default:
153                         return COMMENT_LINE;
154                 }
155             case START_OF_LINE:
156                 switch(symbol) {
157                     case COMMENT_SYMBOL:
158                         if (symbolMapping.isSkipCommentLines()) {
159                             return COMMENT_LINE;
160                         }
161                 } // Fallthrough intentional
162             case SEPARATOR:
163                 switch(symbol) {
164                     case SPACE_SYMBOL:
165                         return OUTSIDE_BEFORE_FIELD;
166                     case QUOTE_SYMBOL:
167                         return FIRST_CHAR_INSIDE_QUOTED_FIELD;
168                     case SEPARATOR_SYMBOL :
169                         return SEPARATOR;
170                     case END_OF_FILE_SYMBOL:
171                         return FINISHED;
172                     case EOL_SYMBOL :
173                         return LINE_FINISHED;
174                     default :
175                         return INSIDE_FIELD;
176                 }
177             case OUTSIDE_BEFORE_FIELD:
178                 switch(symbol) {
179                     case SPACE_SYMBOL:
180                         return OUTSIDE_BEFORE_FIELD;
181                     case SEPARATOR_SYMBOL :
182                         return SEPARATOR;
183                     case END_OF_FILE_SYMBOL:
184                         return FINISHED;
185                     case EOL_SYMBOL :
186                         return LINE_FINISHED;
187                     case QUOTE_SYMBOL:
188                         return FIRST_CHAR_INSIDE_QUOTED_FIELD;
189                     default :
190                         return INSIDE_FIELD;
191                 }
192             case OUTSIDE_AFTER_FIELD:
193                 switch (symbol) {
194                     case SPACE_SYMBOL:
195                         return OUTSIDE_AFTER_FIELD;
196                     case SEPARATOR_SYMBOL :
197                         return SEPARATOR;
198                     case END_OF_FILE_SYMBOL:
199                         return FINISHED;
200                     case EOL_SYMBOL :
201                         return LINE_FINISHED;
202                     default :
203                         throw new ParseException(state, symbolCharacter, symbol);
204                 }
205             case INSIDE_FIELD:
206                 switch (symbol) {
207                     case SEPARATOR_SYMBOL :
208                         return SEPARATOR;
209                     case END_OF_FILE_SYMBOL:
210                         return FINISHED;
211                     case EOL_SYMBOL :
212                         return LINE_FINISHED;
213                     case QUOTE_SYMBOL :
214                         throw new ParseException(state, symbolCharacter, symbol);
215                     default :
216                         return INSIDE_FIELD;
217                 }
218             case FIRST_CHAR_INSIDE_QUOTED_FIELD:
219             case INSIDE_QUOTED_FIELD:
220                 switch (symbol) {
221                     case QUOTE_SYMBOL :
222                         return OUTSIDE_AFTER_FIELD;
223                     case ESCAPE_SYMBOL :
224                         return ESCAPING;
225                     case END_OF_FILE_SYMBOL:
226                         throw new ParseException(state, symbolCharacter, symbol);
227                     default :
228                         return INSIDE_QUOTED_FIELD;
229                 }
230             case ESCAPING:
231                 if (symbolMapping.isSameCharactersForEscapeAndQuote()) { // This is the default
232                     switch (symbol) {
233                         case SPACE_SYMBOL:
234                             return OUTSIDE_AFTER_FIELD;
235                         case QUOTE_SYMBOL :
236                             return INSIDE_QUOTED_FIELD;
237                         case EOL_SYMBOL: // Needed when quote/escape are the same: ...abc"\n
238                             return LINE_FINISHED;
239                         case SEPARATOR_SYMBOL : // Needed when quote/escape are the same: ...abc";
240                             return SEPARATOR;
241                         case END_OF_FILE_SYMBOL:
242                             return FINISHED;
243                         default :
244                             throw new ParseException(state, symbolCharacter, symbol);
245                     }
246                 }
247                 // We're lenient -- accept everything
248                 return INSIDE_QUOTED_FIELD;
249             default :
250                 throw new ParseException(state, symbolCharacter, symbol);
251         }
252     }
253 
254     public void setSymbolMapping(SymbolMapping symbolMapping) {
255         this.symbolMapping = symbolMapping;
256     }
257 
258     public SymbolMapping getSymbolMapping() {
259         return this.symbolMapping;
260     }
261 
262 }