06 Hive Interview Questions

Tutorial · 薪牛 · 2023-01-17 23:34:43
  • Data preparation
    1. Create the tables
      create table dept(deptid int, deptname string, address string);
      create table emp(empid int, empname string, salary decimal, ruzhidate date, mgr int, deptid int);
    2. Insert the data
      insert into dept values(1,'研发部','北京'),(2,'人事部','上海'),(3,'销售部','深圳'),(4,'公关部','东莞');
      insert into emp values(1001,'tom',18000,'2013-10-11',1005,1),(1002,'jerry',13000,'2021-11-11',1005,1),(1003,'jack',10000,'2020-09-11',1001,1),(1004,'rose',5000,'2020-10-11',1001,2),(1005,'bob',20000,'2018-08-11',null,2);
  • Question 1: Find the three highest-paid employees in the whole company

    select * from emp order by salary desc limit 3;
    (sort by only orders rows within each reducer, so it cannot guarantee a global top 3; order by is required here.)
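    A window-function version gives the same global top 3 and makes tie handling explicit; a sketch (swap row_number() for rank() if ties should share a position):
    select empid, empname, salary
    from (
      select empid, empname, salary,
             row_number() over (order by salary desc) as rn
      from emp
    ) t
    where rn <= 3;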
  • Question 2: Return each department's name and its number of employees
    select d.deptname,count(*) from
    dept d join emp e on d.deptid=e.deptid
    group by d.deptname
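    Because this is an inner join, departments with no employees (公关部 in the sample data) disappear from the result. If they should show a count of 0, a left-join sketch:
    select d.deptname, count(e.empid) as empcnt
    from dept d
    left join emp e on d.deptid = e.deptid
    group by d.deptname;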
  • Question 3: Find the highest salary in the company, and who earns it
  1. Approach 1:

    select e.empname,e.salary from 
    emp e
    join
    (select max(salary) as maxsalary from emp) t1
    on e.salary=t1.maxsalary;
  2. Approach 2 (needs a Hive version with scalar-subquery support in the where clause):
    select * from emp where salary = (select max(salary) from emp);
  • Question 4: Find each department's highest salary, and who earns it

    select e.empname,e.salary,e.deptid from 
    emp  e
    join
    (select max(salary) as maxsalary ,deptid from emp group by deptid) t1
    on e.deptid=t1.deptid and e.salary=t1.maxsalary;
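    A single-pass alternative with a window function; a sketch (rank() keeps every employee who ties for the department maximum):
    select empname, salary, deptid
    from (
      select empname, salary, deptid,
             rank() over (partition by deptid order by salary desc) as r
      from emp
    ) t
    where r = 1;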
  • Question 5: Which departments have an average salary higher than the company-wide average?

    select t1.deptavg, t1.deptid from
    (select avg(salary) as deptavg, deptid, 'link' as link from emp group by deptid) t1
    join
    (select avg(salary) as totalavg, 'link' as link from emp) t2
    on t1.link=t2.link where t1.deptavg>t2.totalavg;
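    The constant link column exists only to force a one-row cross join between the two aggregates. A window-function alternative avoids it; a sketch (assuming a Hive version that accepts an empty over() window, meaning the whole result set):
    select distinct deptid, deptavg
    from (
      select deptid,
             avg(salary) over (partition by deptid) as deptavg,
             avg(salary) over () as totalavg
      from emp
    ) t
    where deptavg > totalavg;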
  • Question 6: Which employees earn more than their own department's average salary?
    select * from
    (select avg(salary) as deptavg, deptid from emp group by deptid) t1
    join
    emp t2
    on t1.deptid=t2.deptid where t2.salary>t1.deptavg;
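    The same windowing idea works here too, avoiding the join entirely; a sketch:
    select empid, empname, salary, deptid
    from (
      select empid, empname, salary, deptid,
             avg(salary) over (partition by deptid) as deptavg
      from emp
    ) t
    where salary > deptavg;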

Data preparation
Given the following user request data:
user id (uid) string
request time (req_time) string
request url (req_url) string
processing date (day) string

Processing date: 20220919

uid req_time req_url
u0001 20200918102030 /index.jsp
u0001 20200918102035 /goods.action
u0002 20200918102030 /brand.action
u0001 20200918102130 /index.jsp
u0002 20200918102230 /index.jsp

Processing date: 20220920

uid req_time req_url
u0001 20200919102030 /index.jsp
u0003 20200919102035 /goods.action
u0004 20200919102030 /brand.action
u0001 20200919102130 /index.jsp
u0003 20200919102230 /index.jsp

The user data is as follows:

user id (uid)
user name (uname)

uid uname
u0001 用户1
u0002 用户2
u0003 用户3
  • Question 7: Using the request data above, create a textfile-format partitioned table hainiu.request_text with one partition per processing date, and load the data

    create table request_text(uid string, req_time string, req_url string) partitioned by (day string) row format delimited fields terminated by '\t';
    load data local inpath '/home/hadoop/19' into table request_text partition(day='20220919');
    load data local inpath '/home/hadoop/20' into table request_text partition(day='20220920');
  • Question 8: Create a parquet partitioned table hainiu.request_p with one partition per processing date, and copy each daily partition of hainiu.request_text into the matching partition of request_p
    create table request_p (uid string,req_time string ,req_url string) partitioned by (day string) stored as parquet;
  1. Static partitions (list the data columns explicitly; select * would also emit the partition column and no longer match the target schema)
    insert into table request_p partition(day='20220919') select uid, req_time, req_url from request_text where day='20220919';
    insert into table request_p partition(day='20220920') select uid, req_time, req_url from request_text where day='20220920';
  2. Dynamic partitions (here select * is correct: the trailing day column drives the dynamic partitioning)
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.dynamic.partition=true;  
    insert into table request_p partition(day) select * from request_text;
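    To verify either table after loading, list its partitions:
    show partitions request_p;
    -- expected: day=20220919 and day=20220920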

Data preparation
Create the user table user_text with two columns, uid and uname (the original leaves the storage format unspecified), and load the data below; a DDL sketch follows the data.
u0001 用户1
u0002 用户2
u0003 用户3
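
A minimal sketch of the missing DDL, assuming a tab-delimited textfile and a hypothetical local data file /home/hadoop/user.txt:

create table user_text(uid string, uname string) row format delimited fields terminated by '\t' stored as textfile;
load data local inpath '/home/hadoop/user.txt' into table user_text;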

  • Question 9: Create a parquet partitioned table request_p2 and load it by joining the user table with request_p, matching the required schema
    Table schema:
    user id
    user name
    request date, formatted to keep only year/month/day, e.g. 20200919
    request url
    processing date

    create table request_p2(uid string,uname string,req_day string,req_url string) partitioned by (day string) stored as parquet;

    Load with an insert-select using dynamic partitions (the dynamic-partition settings from Question 8 must still be in effect):

    insert into table request_p2 partition(day)
    select t1.uid,t2.uname,t1.req_day,t1.req_url,t1.day from 
    (select uid,substring(req_time,1,8) as req_day,req_url,day from request_p) t1
    join
    user_text t2 on t1.uid=t2.uid; 
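    Note that the inner join silently drops requests from any uid absent from user_text (u0004 in the sample data). If those rows should be kept with a null uname, a left-join sketch:
    insert into table request_p2 partition(day)
    select t1.uid, t2.uname, t1.req_day, t1.req_url, t1.day from
    (select uid, substring(req_time,1,8) as req_day, req_url, day from request_p) t1
    left join
    user_text t2 on t1.uid = t2.uid;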
  • Question 10: Using request_p2, count the distinct users whose request date is 20200918 and processing date is 20220919, and export the result to /home/hadoop/export/output1.txt on Linux

    1. Approach 1 (the local keyword writes to the Linux filesystem rather than HDFS; Hive emits part files such as 000000_0 under the directory):
      insert overwrite local directory '/home/hadoop/export/' select count(distinct uid) from request_p2 where req_day='20200918' and day='20220919';
    2. Approach 2 (redirecting hive -e output produces the exact output1.txt file):
      hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select count(distinct(uid)) from request_p2 where req_day='20200918' and day='20220919';" > /home/hadoop/export/output1.txt
  • Question 11: Using request_p2, compute each user's request count for each request date, and export the result to /home/hadoop/export/output2.txt on Linux

    hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select req_day,uid ,count(*) from request_p2 group by req_day,uid;" > /home/hadoop/export/output2.txt
  • Question 12: Using request_p2, find the incremental users on the 20th versus the 19th (uids present on 20220920 but absent on 20220919), and export the result to /home/hadoop/export/output3.txt on Linux

    hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select t2.uid from (select uid from request_p2 where day='20220919' group by uid) t1 right join (select uid from request_p2 where day='20220920' group by uid) t2 on t1.uid=t2.uid where t1.uid is null;" > /home/hadoop/export/output3.txt
  • Question 13: From request_p, over processing dates 20220919-20220920, find each user's last request record and export the result to /home/hadoop/export/output4.txt on Linux
    hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select * from (select uid,req_time,req_url,row_number() over(partition by uid order by req_time desc) as lastreq from request_p where day between '20220919' and '20220920') t1 where lastreq=1;" > /home/hadoop/export/output4.txt