/*
 * Copyright 2013 Toshiyuki Takahashi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
package com.github.tototoshi.csv
|
|
|
|
import scala.annotation.switch
|
|
|
|
/**
 * Hand-written state machine that splits a single CSV record into its fields.
 *
 * The parser consumes characters from the start of the input until it reaches
 * the first line terminator that is not inside a quoted field, or the end of
 * the input, whichever comes first. Anything after that terminator is ignored.
 */
object CSVParser {

  /** Thrown by [[CSVParser.apply]] when the input breaks the quoting/escaping rules. */
  class MalformedCSVException(message: String) extends Exception(message)

  // States are constant Int values (no type ascription on purpose) so that
  // `(state: @switch) match` can be compiled to a JVM switch.
  private type State = Int
  final private val Start = 0 // nothing consumed yet
  final private val Field = 1 // inside an unquoted field
  final private val Delimiter = 2 // just consumed a delimiter
  final private val End = 3 // record terminator consumed; loop stops
  final private val QuoteStart = 4 // just consumed an opening quote
  final private val QuoteEnd = 5 // just consumed a closing quote
  final private val QuotedField = 6 // inside a quoted field

  /**
   * Parses the first record found in `input`.
   *
   * A leading byte-order mark (`U+FEFF`) is skipped. `\n`, `\r`, `\r\n`,
   * `U+2028`, `U+2029` and `U+0085` terminate the record when they occur
   * outside a quoted field; inside a quoted field they are kept as data.
   *
   * @param input      the text to parse
   * @param escapeChar character that escapes itself and the delimiter in
   *                   unquoted fields, and itself and the quote character in
   *                   quoted fields (only consulted in quoted fields when it
   *                   differs from `quoteChar`)
   * @param delimiter  field separator
   * @param quoteChar  character that opens and closes a quoted field; a doubled
   *                   quote inside a quoted field yields one literal quote
   * @return `Some(fields)` for a complete record, or `None` when the input ends
   *         while still inside a quoted field (the record is incomplete)
   * @throws MalformedCSVException if an escape character is followed by a
   *         character it cannot escape (outside quoted fields, or directly
   *         after an opening quote), if an escape character ends the input
   *         directly after a delimiter or inside a quoted field, or if a
   *         closing quote is followed by anything other than a delimiter or a
   *         line terminator
   */
  def apply(input: String, escapeChar: Char, delimiter: Char, quoteChar: Char): Option[List[String]] = {
    val buf: Array[Char] = input.toCharArray
    var fields: Vector[String] = Vector()
    var field = new StringBuilder
    var state: State = Start
    var pos = 0
    val buflen = buf.length

    // Skip a leading byte-order mark.
    if (buflen > 0 && buf(0) == '\uFEFF') {
      pos += 1
    }

    while (state != End && pos < buflen) {
      val c = buf(pos)
      (state: @switch) match {
        case Start => {
          c match {
            case `quoteChar` => {
              state = QuoteStart
              pos += 1
            }
            case `delimiter` => {
              fields :+= field.toString
              field = new StringBuilder
              state = Delimiter
              pos += 1
            }
            case '\n' | '\u2028' | '\u2029' | '\u0085' => {
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case '\r' => {
              // Treat "\r\n" as one terminator: look at the character right
              // after the '\r', not at a fixed index.
              if (pos + 1 < buflen && buf(pos + 1) == '\n') {
                pos += 1
              }
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case x => {
              field += x
              state = Field
              pos += 1
            }
          }
        }
        case Delimiter => {
          c match {
            case `quoteChar` => {
              state = QuoteStart
              pos += 1
            }
            case `escapeChar` => {
              // Directly after a delimiter an escape must be followed by an
              // escapable character; a dangling escape is an error here.
              if (
                pos + 1 < buflen
                && (buf(pos + 1) == escapeChar || buf(pos + 1) == delimiter)
              ) {
                field += buf(pos + 1)
                state = Field
                pos += 2
              } else {
                throw new MalformedCSVException(buf.mkString)
              }
            }
            case `delimiter` => {
              fields :+= field.toString
              field = new StringBuilder
              state = Delimiter
              pos += 1
            }
            case '\n' | '\u2028' | '\u2029' | '\u0085' => {
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case '\r' => {
              if (pos + 1 < buflen && buf(pos + 1) == '\n') {
                pos += 1
              }
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case x => {
              field += x
              state = Field
              pos += 1
            }
          }
        }
        case Field => {
          c match {
            case `escapeChar` => {
              if (pos + 1 < buflen) {
                if (
                  buf(pos + 1) == escapeChar
                  || buf(pos + 1) == delimiter
                ) {
                  field += buf(pos + 1)
                  state = Field
                  pos += 2
                } else {
                  throw new MalformedCSVException(buf.mkString)
                }
              } else {
                // Escape character is the last character of the input: it is
                // dropped and the field is finished by the end-of-input
                // handling below (QuoteEnd emits the pending field).
                state = QuoteEnd
                pos += 1
              }
            }
            case `delimiter` => {
              fields :+= field.toString
              field = new StringBuilder
              state = Delimiter
              pos += 1
            }
            case '\n' | '\u2028' | '\u2029' | '\u0085' => {
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case '\r' => {
              if (pos + 1 < buflen && buf(pos + 1) == '\n') {
                pos += 1
              }
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case x => {
              field += x
              state = Field
              pos += 1
            }
          }
        }
        case QuoteStart => {
          c match {
            // Only treated as an escape when it is distinct from the quote
            // character; otherwise the doubled-quote rule below applies.
            case `escapeChar` if escapeChar != quoteChar => {
              if (pos + 1 < buflen) {
                if (
                  buf(pos + 1) == escapeChar
                  || buf(pos + 1) == quoteChar
                ) {
                  field += buf(pos + 1)
                  state = QuotedField
                  pos += 2
                } else {
                  throw new MalformedCSVException(buf.mkString)
                }
              } else {
                throw new MalformedCSVException(buf.mkString)
              }
            }
            case `quoteChar` => {
              if (pos + 1 < buflen && buf(pos + 1) == quoteChar) {
                // Doubled quote: one literal quote character.
                field += quoteChar
                state = QuotedField
                pos += 2
              } else {
                // Closing quote of an empty quoted field.
                state = QuoteEnd
                pos += 1
              }
            }
            case x => {
              field += x
              state = QuotedField
              pos += 1
            }
          }
        }
        case QuoteEnd => {
          c match {
            case `delimiter` => {
              fields :+= field.toString
              field = new StringBuilder
              state = Delimiter
              pos += 1
            }
            case '\n' | '\u2028' | '\u2029' | '\u0085' => {
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case '\r' => {
              if (pos + 1 < buflen && buf(pos + 1) == '\n') {
                pos += 1
              }
              fields :+= field.toString
              field = new StringBuilder
              state = End
              pos += 1
            }
            case _ => {
              // Data directly after a closing quote is not allowed.
              throw new MalformedCSVException(buf.mkString)
            }
          }
        }
        case QuotedField => {
          c match {
            case `escapeChar` if escapeChar != quoteChar => {
              if (pos + 1 < buflen) {
                if (
                  buf(pos + 1) == escapeChar
                  || buf(pos + 1) == quoteChar
                ) {
                  field += buf(pos + 1)
                  state = QuotedField
                  pos += 2
                } else {
                  // Escape before an ordinary character: keep both characters
                  // verbatim.
                  field += buf(pos)
                  field += buf(pos + 1)
                  state = QuotedField
                  pos += 2
                }
              } else {
                throw new MalformedCSVException(buf.mkString)
              }
            }
            case `quoteChar` => {
              if (pos + 1 < buflen && buf(pos + 1) == quoteChar) {
                field += quoteChar
                state = QuotedField
                pos += 2
              } else {
                state = QuoteEnd
                pos += 1
              }
            }
            case x => {
              // Includes line terminators: they are data inside quotes.
              field += x
              state = QuotedField
              pos += 1
            }
          }
        }
        case End => {
          // Unreachable: the loop condition excludes the End state.
          sys.error("unexpected error")
        }
      }
    }

    (state: @switch) match {
      case Delimiter => {
        // Input ended right after a delimiter: there is a trailing empty field.
        fields :+= ""
        Some(fields.toList)
      }
      case QuotedField => {
        // Input ended inside a quoted field: the record is incomplete.
        None
      }
      case _ => {
        // When no crlf at end of file
        state match {
          case Field | QuoteEnd => {
            fields :+= field.toString
          }
          case _ => {}
        }
        Some(fields.toList)
      }
    }
  }
}
|