[email protected] www.rittmanmead.com @rittmanmead 34
•Hive SELECT statement against nested columns returns data as arrays
•Can parse programatically, or create further views or CTAS tables to split out array
Support for Nested (Array)-Type Structures
hive> select entities, user from tweets
> limit 3;
OK
{"urls":[{"expanded_url":"http://www.rittmanmead.com/
biforum2013"}],"user_mentions":[],"hashtags":[]}
{"screen_name":"markrittman","name":"Mark
Rittman","friends_count":null,"followers_count":null,"statuses_count":null,"ver
ified":false,"utc_offset":null,"time_zone":null}
{"urls":[{"expanded_url":"http://www.bbc.co.uk/news/
technology-22299503"}],"user_mentions":[],"hashtags":[]}
{"screen_name":"markrittman","name":"Mark
Rittman","friends_count":null,"followers_count":null,"statuses_count":null,"ver
ified":false,"utc_offset":null,"time_zone":null}
{"urls":[{"expanded_url":"http://pocket.co/seb2e"}],"user_mentions":
[{"screen_name":"ArtOfBI","name":"Christian Screen"},
{"screen_name":"wiseanalytics","name":"Lyndsay Wise"}],"hashtags":[]}
{"screen_name":"markrittman","name":"Mark
Rittman","friends_count":null,"followers_count":null,"statuses_count":null,"ver
ified":false,"utc_offset":null,"time_zone":null}
How to you work with these values?
CREATE TABLE tweets_expanded
stored as parquet
AS select
tweets.id,
tweets.created_at,
tweets.user.screen_name as user_screen_name,
tweets.user.friends_count as user_friends_count,
tweets.user.followers_count as user_followers_count,
tweets.user.statuses_count as user_tweets_count,
tweets.text,
tweets.in_reply_to_screen_name,
tweets.retweeted_status.user.screen_name as retweet_user_screen_name,
tweets.retweeted_status.retweet_count as retweet_count,
tweets.entities.urls[0].expanded_url as url1,
tweets.entities.urls[1].expanded_url as url2,
tweets.entities.hashtags[0].text as hashtag1,
tweets.entities.hashtags[1].text as hashtag2,
tweets.entities.hashtags[2].text as hashtag3,
tweets.entities.hashtags[3].text as hashtag4
from tweets;
Create a copy of the table in Parquet storage format
“Denormalize” the array by selecting individual elements
CREATE view tweets_expanded_view
AS select
tweets.id,
tweets.created_at,
tweets.user.screen_name as user_screen_name,
tweets.user.friends_count as user_friends_count,
tweets.user.followers_count as user_followers_count,
tweets.user.statuses_count as user_tweets_count,
tweets.text,
tweets.in_reply_to_screen_name,
tweets.retweeted_status.user.screen_name as retweet_user_screen_name,
tweets.retweeted_status.retweet_count as retweet_count,
tweets.entities.urls[0].expanded_url as url1,
tweets.entities.urls[1].expanded_url as url2,
tweets.entities.hashtags[0].text as hashtag1,
tweets.entities.hashtags[1].text as hashtag2,
tweets.entities.hashtags[2].text as hashtag3,
tweets.entities.hashtags[3].text as hashtag4
from tweets;
… or create as a view (not all BI tools support views though)