Data model generator based on Scala case classes.
- Generate data model (e.g. DDL, avro schema, Elasticsearch mapping) based on Scala case classes
Include dependency:
"com.github.piotr-kalanski" % "data-model-generator_2.11" % "0.7.7"
or
<dependency>
<groupId>com.github.piotr-kalanski</groupId>
<artifactId>data-model-generator_2.11</artifactId>
<version>0.7.7</version>
</dependency>
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object H2Example extends App {
println(DataModelGenerator.generate[Book](dialects.H2Dialect))
}
CREATE TABLE Book(
title VARCHAR,
year INT,
owner OTHER,
authors ARRAY
);
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object HiveExample extends App {
println(DataModelGenerator.generate[Book](dialects.HiveDialect))
}
CREATE TABLE Book(
title STRING,
year INT,
owner STRUCT<name : STRING, age : INT>,
authors ARRAY<STRUCT<name : STRING, age : INT>>
);
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object RedshiftExample extends App {
println(DataModelGenerator.generate[Book](dialects.RedshiftDialect))
}
CREATE TABLE Book(
title VARCHAR,
year INTEGER,
owner VARCHAR,
authors VARCHAR
);
import com.datawizards.dmg.{DataModelGenerator, dialects}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
object MySQLExample extends App {
println(DataModelGenerator.generate[Book](dialects.MySQLDialect))
}
CREATE TABLE Book(
title VARCHAR,
year INTEGER,
owner JSON,
authors JSON
);
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
DataModelGenerator.generate[Book](dialects.AvroSchemaDialect)
{
"namespace": "com.datawizards.dmg.examples",
"type": "record",
"name": "Book",
"fields": [
{"name": "title", "type": "string"},
{"name": "year", "type": "int"},
{"name": "owner", "type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]},
{"name": "authors", "type": "array", "items": {"type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}}
]
}
case class Person(name: String, age: Int, skills: Seq[String])
DataModelGenerator.generate[Person](dialects.AvroSchemaRegistryDialect)
{"schema":
"{
\"namespace\": \"com.datawizards.dmg.examples\",
\"type\": \"record\",
\"name\": \"Person\",
\"fields\": [
{\"name\": \"name\", \"type\": \"string\"},
{\"name\": \"age\", \"type\": \"int\"},
{\"name\": \"skills\", \"type\": \"array\", \"items\": \"string\"}
]
}"
}
case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])
DataModelGenerator.generate[Book](dialects.ElasticsearchDialect)
{
"mappings" : {
"Book" : {
"properties" : {
"title" : {"type" : "string"},
"year" : {"type" : "integer"},
"owner" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
},
"authors" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
}
}
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.Java)
public class Person {
private String name;
private Integer age;
public Person() {}
public Person(String name, Integer age) {
this.name = name;
this.age = age;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getAge() {
return age;
}
public void setAge(Integer age) {
this.age = age;
}
}
The library can also install the generated data model in a target data store, e.g. register the generated Avro schema in the Avro Schema Registry, create an Elasticsearch index, or create a Hive table.
import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl
case class Person(name: String, age: Int)
object RegisterAvroSchema extends App {
val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
service.registerSchema[Person]("person")
println("Subjects:")
println(service.subjects())
println("Registered schema:")
println(service.fetchSchema("person"))
}
"Subjects:"
["person"]
"Registered schema:"
{"type":"record","name":"Person","namespace":"com.datawizards.dmg.examples","fields":[{"name":"name","type":"string"},{"name":"age","type":"int"}]}
import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl
case class Person(name: String, age: Int)
object CopyAvroSchemaToHDFS extends App {
val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
service.copyAvroSchemaToHdfs[Person]("/metadata/schemas/person")
}
import com.datawizards.dmg.service.ElasticsearchServiceImpl
case class Person(name: String, age: Int)
object CreateElasticsearchIndex extends App {
val service = new ElasticsearchServiceImpl("http://localhost:9200")
service.createIndex[Person]("person")
println("Index:")
println(service.getIndexSettings("person"))
}
import com.datawizards.dmg.examples.TestModel.PersonWithMultipleEsAnnotations
import com.datawizards.dmg.service.ElasticsearchServiceImpl
object CreateElasticsearchTemplate extends App {
val service = new ElasticsearchServiceImpl("http://localhost:9200")
service.updateTemplate[PersonWithMultipleEsAnnotations]("people")
println("Template:")
println(service.getTemplate("people"))
}
import com.datawizards.dmg.service.HiveServiceImpl
HiveServiceImpl.createHiveTable[Person]()
To extract class metadata, use the method MetaDataWithDialectExtractor.extractClassMetaDataForDialect. Example:
MetaDataWithDialectExtractor.extractClassMetaDataForDialect[T](Some(dialects.HiveDialect))
import com.datawizards.dmg.annotations._
case class Person(
@column(name="personName")
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE Person(
personName VARCHAR,
age INT
);
import com.datawizards.dmg.annotations._
case class Person(
@column(name="NAME")
@column(name="personName", dialects.ElasticsearchDialect)
name: String,
@column(name="AGE")
@column(name="personAge", dialects.ElasticsearchDialect)
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE Person(
NAME VARCHAR,
AGE INT
);
{
"mappings" : {
"Person" : {
"personName" : {"type" : "string"},
"personAge" : {"type" : "integer"}
}
}
}
import com.datawizards.dmg.annotations._
@table("PEOPLE")
case class Person(
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE PEOPLE(
name VARCHAR,
age INT
);
import com.datawizards.dmg.annotations._
@table("PEOPLE")
@table("person", dialects.ElasticsearchDialect)
case class Person(
name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE PEOPLE(
name VARCHAR,
age INT
);
{
"mappings" : {
"person" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
data-model-generator supports placeholder variables when generating a data model. Placeholder variables can be used in any annotation.
An example use case for placeholder variables is generating a table name that depends on the environment. For example, each environment (e.g. development, uat, production) has a dedicated DB schema.
@table("${environment}.people")
case class Person(
name: String,
age: Int
)
TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "development"))
CREATE TABLE development.people(
name VARCHAR,
age INT
);
TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "production"))
CREATE TABLE production.people(
name VARCHAR,
age INT
);
@comment("People data")
case class PersonWithComments(
@comment("Person name") name: String,
age: Int
)
DataModelGenerator.generate[PersonWithComments](dialects.H2Dialect)
CREATE TABLE PersonWithComments(
name VARCHAR COMMENT 'Person name',
age INT
);
COMMENT ON TABLE PersonWithComments IS 'People data';
DataModelGenerator.generate[PersonWithComments](dialects.HiveDialect)
CREATE TABLE PersonWithComments(
name STRING COMMENT 'Person name',
age INT
)
COMMENT 'People data';
DataModelGenerator.generate[PersonWithComments](dialects.RedshiftDialect)
CREATE TABLE PersonWithComments(
name VARCHAR,
age INTEGER
);
COMMENT ON TABLE PersonWithComments IS 'People data';
COMMENT ON COLUMN PersonWithComments.name IS 'Person name';
DataModelGenerator.generate[PersonWithComments](dialects.AvroSchemaDialect)
{
"namespace": "com.datawizards.dmg.examples",
"type": "record",
"name": "PersonWithComments",
"doc": "People data",
"fields": [
{"name": "name", "type": "string", "doc": "Person name"},
{"name": "age", "type": "int"}
]
}
import com.datawizards.dmg.annotations._
case class Person(
@length(1000) name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE Person(
name VARCHAR(1000),
age INT
);
import com.datawizards.dmg.annotations._
case class PersonWithNull(
@notNull name: String,
age: Int
)
DataModelGenerator.generate[PersonWithNull](dialects.H2Dialect)
DataModelGenerator.generate[PersonWithNull](dialects.RedshiftDialect)
DataModelGenerator.generate[PersonWithNull](dialects.AvroSchemaDialect)
CREATE TABLE PersonWithNull(
name VARCHAR NOT NULL,
age INT
);
CREATE TABLE PersonWithNull(
name VARCHAR NOT NULL,
age INTEGER
);
{
"namespace": "com.datawizards.dmg",
"type": "record",
"name": "PersonWithNull",
"fields": [
{"name": "name", "type": "string"},
{"name": "age", "type": ["null", "int"]}
]
}
Convert table and column names for selected dialect to underscore convention.
@underscore(dialect=dialects.H2Dialect)
case class PersonWithUnderscore(
personName: String,
personAge: Int
)
CREATE TABLE person_with_underscore(
person_name VARCHAR,
person_age INT
);
@hiveExternalTable(location="hdfs:///data/people")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE EXTERNAL TABLE Person(
name STRING,
age INT
)
LOCATION 'hdfs:///data/people';
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe';
@hiveStoredAs(format="PARQUET")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
STORED AS PARQUET;
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
name STRING,
age INT
)
TBLPROPERTIES(
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/person.avro")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.HiveDialect)
If the "avro.schema.url" table property is provided, the generated data model does not contain any column definitions, because Hive takes them from the Avro schema.
CREATE TABLE Person
TBLPROPERTIES(
'avro.schema.url' = 'hdfs:///metadata/person.avro'
);
case class ClicksPartitioned(
time: Timestamp,
event: String,
user: String,
@hivePartitionColumn
year: Int,
@hivePartitionColumn
month: Int,
@hivePartitionColumn
day: Int
)
DataModelGenerator.generate[ClicksPartitioned](dialects.HiveDialect)
CREATE TABLE ClicksPartitioned(
time TIMESTAMP,
event STRING,
user STRING
)
PARTITIONED BY(year INT, month INT, day INT);
case class ClicksPartitionedWithOrder(
time: Timestamp,
event: String,
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[ClicksPartitionedWithOrder](dialects.HiveDialect)
CREATE TABLE ClicksPartitionedWithOrder(
time TIMESTAMP,
event STRING,
user STRING
)
PARTITIONED BY(year INT, month INT, day INT);
@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveStoredAs(format="PARQUET")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class ParquetTableWithManyAnnotations(
@column("eventTime")
@comment("Event time")
time: Timestamp,
@comment("Event name")
event: String,
@comment("User id")
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[ParquetTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME(
eventTime TIMESTAMP COMMENT 'Event time',
event STRING COMMENT 'Event name',
user STRING COMMENT 'User id'
)
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
STORED AS PARQUET
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
@hiveStoredAs("INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/table.avro")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class AvroTableWithManyAnnotations(
@column("eventTime")
@comment("Event time")
time: Timestamp,
@comment("Event name")
event: String,
@comment("User id")
user: String,
@hivePartitionColumn(order=3)
day: Int,
@hivePartitionColumn(order=1)
year: Int,
@hivePartitionColumn(order=2)
month: Int
)
DataModelGenerator.generate[AvroTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
'avro.schema.url' = 'hdfs:///metadata/table.avro',
'key1' = 'value1',
'key2' = 'value2',
'key3' = 'value3'
);
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
@esSetting("blocks.read_only", true)
@esSetting("codec", "best_compression")
case class Person(name: String, age: Int)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"settings" : {
"number_of_shards" : 1,
"number_of_replicas" : 3,
"blocks.read_only" : "true",
"codec" : "best_compression"
},
"mappings" : {
"Person" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
Index parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-index.html
case class Person(
@esIndex("not_analyzed") name: String,
age: Int
)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"mappings" : {
"Person" : {
"properties" : {
"name" : {"type" : "string", "index" : "not_analyzed"},
"age" : {"type" : "integer"}
}
}
}
}
Date format parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html
case class Person(
name: String,
@esFormat("yyyy-MM-dd") birthday: Date
)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
"mappings" : {
"Person" : {
"properties" : {
"name" : {"type" : "string"},
"birthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
}
}
}
}
https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-templates.html
@esTemplate("people*")
case class PersonWithEsTemplate(name: String, age: Int)
DataModelGenerator.generate[PersonWithEsTemplate](dialects.ElasticsearchDialect)
{
"template" : "people*",
"mappings" : {
"PersonWithEsTemplate" : {
"properties" : {
"name" : {"type" : "string"},
"age" : {"type" : "integer"}
}
}
}
}
@table("people")
@esTemplate("people*")
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
case class PersonWithMultipleEsAnnotations(
@esIndex("not_analyzed")
@column("personName")
name: String,
@column("personBirthday")
@esFormat("yyyy-MM-dd")
birthday: Date
)
DataModelGenerator.generate[PersonWithMultipleEsAnnotations](dialects.ElasticsearchDialect)
{
"template" : "people*",
"settings" : {
"number_of_shards" : 1,
"number_of_replicas" : 3
},
"mappings" : {
"people" : {
"properties" : {
"personName" : {"type" : "string", "index" : "not_analyzed"},
"personBirthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
}
}
}
}