Validate CSV files against a schema
Supports Scala 3 and ZIO 2
libraryDependencies += "com.bilal-fazlani" %% "csv-schema" % "<VERSION>"
Schema file
columns:
- string-schema:
column-name: name
max-length: 100
min-length: 3
regex: "[a-zA-Z]*"
- string-schema:
column-name: city
allowed-values:
- Mumbai
- Pune
- Delhi
- boolean-schema:
column-name: selfEmployed
- integer-schema:
column-name: age
min: 10
max: 100
required: false
- double-schema:
column-name: salary
min: 999.00534
max: 80000
required: false
CSV file
name, city, selfEmployed, age
,,,
as, Tokyo, dddd, 150
john,
ab cd, Delhi, false, 2
asasd, Mumbai, true, 20
import zio.*
import com.bilalfazlani.csvSchema.*
import zio.nio.file.Path
import zio.Console.*
/** Example application: loads the schema from `./example/test.schema.yml` and
  * validates `./example/data-invalid.csv` against it.
  */
object ExampleApp extends ZIOAppDefault {

  // Load the schema, feed it to the validator, and discard the validator's result.
  val program =
    CsvSchema
      .File(Path("./example/test.schema.yml"))
      .load
      .flatMap(schema => CsvValidation.validate(schema, Path("./example/data-invalid.csv")))
      .unit

  // On failure the error is printed via `printLineError`; "Valid data" is printed
  // only when `program` succeeds. The outcome is then mapped to an exit code.
  def run =
    (program
      .provideSome[Scope](CsvValidation.live)
      .tapError(failure => printLineError(failure.toString)) *> printLine("Valid data")).exitCode
}
You can also create the schema programmatically inline
// Schema built directly in code: one ColumnSchema per column, combined
// left to right with `&` in the order the columns are declared.
val schema: CsvSchema = {
  // `name` column: string with length bounds and a regex constraint
  val nameColumn = ColumnSchema.StringSchema(
    columnName = "name",
    maxLength = Some(100),
    minLength = Some(3),
    regex = Some("[a-zA-Z]*".r)
  )

  // `city` column: string restricted to a fixed set of values
  val cityColumn = ColumnSchema.StringSchema(
    columnName = "city",
    allowedValues = Set("Mumbai", "Pune", "Delhi")
  )

  // `selfEmployed` column: boolean
  val selfEmployedColumn = ColumnSchema.BooleanSchema(columnName = "selfEmployed")

  // `age` column: optional integer with min/max bounds
  val ageColumn = ColumnSchema.IntegerSchema(
    columnName = "age",
    min = Some(10),
    max = Some(100),
    required = false
  )

  // `salary` column: optional double with min/max bounds
  val salaryColumn = ColumnSchema.DoubleSchema(
    columnName = "salary",
    min = Some(1000.0002),
    max = Some(8000.999),
    required = false
  )

  nameColumn & cityColumn & selfEmployedColumn & ageColumn & salaryColumn
}
Error reporting
validation failed at ./test.csv:2
- missing value for `name`
- missing value for `city`
- missing value for `selfEmployed`
syntax error at ./test.csv:3
- blank line
validation failed at ./test.csv:4
- name 'as' is of length 2. expected length to be >= 3
- city 'Tokyo' is invalid. valid values are: Mumbai, Pune, Delhi
- selfEmployed 'dddd' is not a valid Boolean
- age '150' should be <= 100
syntax error at ./test.csv:5
- 2 value(s) found. expected number of values: 4
validation failed at ./test.csv:6
- name 'ab cd' did not match regex: [a-zA-Z]*
- age '2' should be >= 10