Cloudera VM standalone cluster learnings - ayushmathur94/Spark GitHub Wiki

1. To access Hive using the beeline command:

beeline -u jdbc:hive2://localhost:10000
show databases ; 
use `mydb`;
show tables ; 
beeline -u jdbc:hive2://localhost:10000 -f createtableMyTable.hql --verbose true

-- External table over comma-delimited text files under LOCATION; because the
-- table is EXTERNAL, dropping it leaves the underlying HDFS data intact.
CREATE EXTERNAL TABLE commondata (
    id                      INT,
    name                    STRING,
    email                   STRING,
    phone                   STRING,
    job_title               STRING,
    department              STRING,
    salary                  DOUBLE,
    hire_date               STRING,
    gender                  STRING,
    address                 STRING,
    city                    STRING,
    country                 STRING,
    postal_code             STRING,
    start_time              STRING,
    end_time                STRING,
    work_location           STRING,
    department_head         STRING,
    manager_name            STRING,
    manager_email           STRING,
    team_lead               STRING,
    team_lead_email         STRING,
    project_name            STRING,
    project_description     STRING,
    project_manager         STRING,
    project_manager_email   STRING,
    project_start_date      STRING,
    project_end_date        STRING,
    technology              STRING,
    language                STRING,
    tool                    STRING,
    state                   STRING,
    vendor                  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/path/to/data/';

For the ORC storage format:

-- Same external table, but stored as ORC with Snappy compression set through
-- table properties; Hive writes ORC stripes instead of delimited text.
CREATE EXTERNAL TABLE commondata (
    id                      INT,
    name                    STRING,
    email                   STRING,
    phone                   STRING,
    job_title               STRING,
    department              STRING,
    salary                  DOUBLE,
    hire_date               STRING,
    gender                  STRING,
    address                 STRING,
    city                    STRING,
    country                 STRING,
    postal_code             STRING,
    start_time              STRING,
    end_time                STRING,
    work_location           STRING,
    department_head         STRING,
    manager_name            STRING,
    manager_email           STRING,
    team_lead               STRING,
    team_lead_email         STRING,
    project_name            STRING,
    project_description     STRING,
    project_manager         STRING,
    project_manager_email   STRING,
    project_start_date      STRING,
    project_end_date        STRING,
    technology              STRING,
    language                STRING,
    tool                    STRING,
    state                   STRING,
    vendor                  STRING
)
STORED AS ORC
LOCATION '/path/to/data/'
TBLPROPERTIES ("orc.compress"="snappy");


# SparkSession was referenced below but never imported; it is the entry point
# for DataFrame + Hive operations.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Build (or reuse) a Hive-enabled SparkSession. The original chained calls were
# split across bare lines with no continuation, which is a SyntaxError when run
# as a script; wrapping the chain in parentheses fixes that.
# convertMetastoreOrc=false tells Spark to go through Hive's ORC serde rather
# than its native ORC reader/writer when talking to metastore ORC tables.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.hive.convertMetastoreOrc", "false")
    .enableHiveSupport()
    .getOrCreate()
)

Create the schema for the data:

# Column catalog for the commondata table: every field is nullable; only id
# (int) and salary (double) are non-string.
_columns = [
    ('id', IntegerType()),
    ('name', StringType()),
    ('email', StringType()),
    ('phone', StringType()),
    ('job_title', StringType()),
    ('department', StringType()),
    ('salary', DoubleType()),
    ('hire_date', StringType()),
    ('gender', StringType()),
    ('address', StringType()),
    ('city', StringType()),
    ('country', StringType()),
    ('postal_code', StringType()),
    ('start_time', StringType()),
    ('end_time', StringType()),
    ('work_location', StringType()),
    ('department_head', StringType()),
    ('manager_name', StringType()),
    ('manager_email', StringType()),
    ('team_lead', StringType()),
    ('team_lead_email', StringType()),
    ('project_name', StringType()),
    ('project_description', StringType()),
    ('project_manager', StringType()),
    ('project_manager_email', StringType()),
    ('project_start_date', StringType()),
    ('project_end_date', StringType()),
    ('technology', StringType()),
    ('language', StringType()),
    ('tool', StringType()),
    ('state', StringType()),
    ('vendor', StringType()),
]

# Same StructType as writing the 32 StructFields inline, built from the table
# above so column order and types stay easy to audit.
schema = StructType([StructField(name, dtype, True) for name, dtype in _columns])

Create a DataFrame with the desired columns and data

# Sample rows matching the 32-field commondata schema, one tuple per line.
# Fixes to the original:
#   * missing commas between tuples 5/6 and 6/7 (SyntaxError),
#   * the string "[email protected]" in tuple 5 was broken across two lines
#     mid-token,
#   * tuple 6 carried 33 values (an extra "John Smith" between department_head
#     and manager_name); it was removed so the row matches the schema's
#     dept_head / manager_name / manager_email / team_lead / team_lead_email
#     layout used by every other tuple.
data = [
    (1, "John", "[email protected]", "123-456-7890", "Software Engineer", "IT", 10000.0, "2022-01-01", "Male", "123 Main St", "Bangalore", "India", "560001", "9:00 AM", "5:00 PM", "Bangalore", "Steve Jobs", "John Doe", "[email protected]", "Jane Smith", "[email protected]", "Project X", "A project about something", "Bob Jones", "[email protected]", "2022-02-01", "2022-06-01", "Java", "English", "Eclipse", "Karnataka", "Samsung"),
    (2, "Jane", "[email protected]", "234-567-8901", "Data Analyst", "Analytics", 8000.0, "2022-02-01", "Female", "456 1st Ave", "Bangalore", "India", "400001", "9:00 AM", "5:00 PM", "Bangalore", "Steve Jobs", "John Doe", "[email protected]", "Jane Smith", "[email protected]", "Project Y", "A project about something else", "Bob Jones", "[email protected]", "2022-03-01", "2022-07-01", "Python", "Hindi", "Jupyter Notebook", "Maharashtra", "Nokia"),
    (3, "Bob", "[email protected]", "345-678-9012", "Product Manager", "Product Management", 12000.0, "2022-03-01", "Male", "789 2nd St", "Pune", "India", "110001", "9:00 AM", "5:00 PM", "Bangalore", "Steve Jobs", "John Doe", "[email protected]", "Jane Smith", "[email protected]", "Project Z", "A project about yet another thing", "Bob Jones", "[email protected]", "2022-04-01", "2022-08-01", "Scala", "English", "IntelliJ IDEA", "Delhi", "Cisco"),
    (4, "Alice", "[email protected]", "456-789-0123", "Database Administrator", "IT", 9000.0, "2022-04-01", "Female", "123 Main St", "Bangalore", "India", "560001", "9:00 AM", "5:00 PM", "Bangalore", "Steve Jobs", "John Doe", "[email protected]", "Jane Smith", "[email protected]", "Project X", "A project about something", "Bob Jones", "[email protected]", "2022-05-01", "2022-09-01", "SQL", "Kannada", "Oracle", "Karnataka", "Samsung"),
    (5, "Kishore", "[email protected]", "456-789-0123", "Database Administrator", "IT", 9000.0, "2022-04-01", "Female", "123 Main St", "Pune", "India", "560001", "9:00 AM", "5:00 PM", "Bangalore", "Ayush M", "John Doe", "[email protected]", "Jane Smith", "[email protected]", "Project X", "A project about something", "Bob Jones", "[email protected]", "2022-05-01", "2022-09-01", "SQL", "Kannada", "Oracle", "Maharashtra", "Samsung"),
    (6, "Mary Smith", "[email protected]", "123-456-7893", "Project Manager", "Project Management", 120000.0, "2022-01-04", "Female", "321 Pine St", "Jaipur", "India", "600001", "8:00 AM", "4:00 PM", "Bangalore", "Ayush M", "Jane Doe", "[email protected]", "Mike Johnson", "[email protected]", "Project X", "This is a project description", "John Smith", "[email protected]", "2022-01-01", "2022-12-31", "Java", "Tamil", "Eclipse", "Tamil Nadu", "Samsung"),
    (7, "Mary Williams", "[email protected]", "123-456-7896", "UI Designer", "Design", 90000.0, "2022-01-07", "Female", "567 Pine St", "Jaipur", "India", "600001", "8:00 AM", "4:00 PM", "Bangalore", "Ayush M", "Jane Doe", "[email protected]", "Mike Johnson", "[email protected]", "Project Y", "This is another project description", "John Smith", "[email protected]", "2022-02-01", "2022-11-30", "JavaScript", "Tamil", "Figma", "Tamil Nadu", "Nokia"),
    (8, "David Lee", "[email protected]", "123-456-7897", "Marketing Coordinator", "Marketing", 70000.0, "2022-01-08", "Male", "234 Elm St", "Pune", "India", "500001", "9:00 AM", "5:00 PM", "Bangalore", "Ayush M", "Alice Smith", "[email protected]", "Mike Johnson", "[email protected]", "Project Z", "This is another project description", "John Smith", "[email protected]", "2022-03-01", "2022-10-31", "Email Marketing", "English", "Mailchimp", "Telangana", "Cisco"),
]

# Build the DataFrame against the explicit schema so column types are pinned
# rather than inferred.
df = spark.createDataFrame(data, schema)

Write the DataFrame to ORC format with Snappy compression:

# Write Snappy-compressed ORC, partitioned by state then city: each distinct
# (state, city) pair becomes a state=.../city=... directory level under the
# target path, and the partition columns are not stored in the data files.
df.write.format("orc").option("compression", "snappy").partitionBy("state", "city").save("/path/to/output/folder/")


Create a DataFrame with the commondata schema and the desired data

data = [ (5, "Alice Smith", "[email protected]", "123-456-7894", "Marketing Manager", "Marketing", 120000.0, "2022-01-05", "Female", "789 Oak St", "Kolkata", "India", "700001", "9:00 AM", "5:00 PM", "Office", "Bob Johnson", "Alice Smith", "[email protected]", "Mike Johnson", "[email protected]", "Project Z", "This is another project description", "John Smith", "[email protected]", "2022-03-01", "2022-10-31", "SEO", "English", "Google Analytics", "West Bengal", "Cisco"), (6, "Bob Johnson", "[email protected]", "123-456-7895", "Software Developer", "Engineering", 110000.0, "2022-01-06", "Male", "890 Maple St", "Mumbai", "India", "400001", "9:00 AM", "5:00 PM", "Office", "John Smith", "Jane Doe", "[email protected]", "Mike Johnson", "[email protected]", "Project X", "This is a project description", "John Smith", "[email protected]", "2022-01-01", "2022-12-31", "Python", "English", "Visual Studio Code", "Maharashtra", "Samsung"), (7, "Mary Williams", "[email protected]", "123-456-7896", "UI Designer", "Design", 90000.0, "2022-01-07", "Female", "567 Pine St", "Chennai", "India", "600001", "8:00 AM", "4:00 PM", "Remote", "John Smith", "Jane Doe", "[email protected]", "Mike Johnson", "[email protected]", "Project Y", "This is another project description", "John Smith", "[email protected]", "2022-02-01", "2022-11-30", "JavaScript", "Tamil", "Figma", "Tamil Nadu", "Nokia"), (8, "David Lee", "[email protected]", "123-456-7897", "Marketing Coordinator", "Marketing", 70000.0, "2022-01-08", "Male", "234 Elm St", "Hyderabad", "India", "500001", "9:00 AM", "5:00 PM", "Office", "Bob Johnson", "Alice Smith", "[email protected]", "Mike Johnson", "[email protected]", "Project Z", "This is another project description", "John Smith", "[email protected]", "2022-03-01", "2022-10-31", "Email Marketing", "English", "Mailchimp", "Telangana", "Cisco"), (9, "Karen Jones", "[email protected]", "123-456-7898", "Software Engineer", "Engineering", 95000.0, "2022-01-09", "Female", "456 Oak St", 
"Bengaluru", "India", "560001", "9:00 AM", "5:00


Check this error:

23/05/05 20:27:53 INFO yarn.SparkRackResolver: Got an error when resolving hostNames. Falling back to /default-rack for all 23/05/05 20:27:54 INFO yarn.SparkRackResolver: Got an error when resolving hostNames. Falling back to /default-rack for all

yarn.ApplicationMaster: Final app status: FAILED, exitCode: 1, (reason: User application exited with status 1)


sudo -u hdfs hdfs dfs -chown -R osboxes:hive /customhivedatabase/organization

sudo -u hdfs hdfs dfs -chmod -R 777 /customhivedatabase/organization


MySQL setup for the Hue database

To start the MySQL client in a terminal: mysql -h quickstart-bigdata -u root -p

mysql> create database hue_db; Query OK, 1 row affected (0.03 sec)

mysql> create user 'hue_user'@'quickstart-bigdata' identified by 'hueadmin' ; Query OK, 0 rows affected (0.09 sec)

mysql> grant all privileges on hue_db.* to 'hue_user'@'quickstart-bigdata' ; Query OK, 0 rows affected (0.04 sec)

mysql> commit; Query OK, 0 rows affected (0.00 sec)


⚠️ **GitHub.com Fallback** ⚠️