impala 实操

impala操作环境

impala-shell

kinit -kt /data/impala.keytab impala
klist
impala-shell

file

# 连接impala时指定impalad，-i参数指定impalad节点(可以是任意节点)，-k参数是采用kerberos认证方式登录
impala-shell -i worker-2 -k

file

实际生产环境下-i参数至关重要，如果有多个并行任务，可以通过-i参数将任务均衡分发到不同的impalad节点上

hue操作impala
- hue创建impala用户

file

使用impala用户登录

file

  -- 执行测试
  select * from test;
  create table test2(id string);

在安全环境下需要kerberos安全认证与sentry授权，认证与授权参照kerberos+sentry实操

impala-shell实操

impala-shell实操与hue实操命令一致。

impala 基本语句与hive基本一致。

创建测试表并加载数据

-- 创建临时表
create table if not exists xinniu.tmp1(
    pk string,
    col1 int,
    col2 boolean,
    col3 timestamp,
    col4 varchar
)
comment '临时加载表'
row format delimited fields terminated by '\t'
;
-- 加载数据
load data inpath '/user/impala/testfile' into table xinniu.tmp1;
-- 创建parquet正式表
create table if not exists xinniu.hainiu1
stored as parquet 
tblproperties ("parquet.compress"="SNAPPY")
as 
select * from xinniu.tmp1
where 1=0
;
-- 临时表加载数据到正式表
insert into table xinniu.hainiu1
select * from xinniu.tmp1;

file

数据导出

impala不支持insert overwrite 的方式导出数据，需要使用-o outputpath的方式导出。

# 参数解释:-i 指定impalad节点 -k 使用kerberos认证方式 -q 查询语句(也可以使用-f sql文件的方式) -B --output_delimiter 指定输出文件分隔符 -o 输出文件
impala-shell -i worker-1 -k -q "select * from xinniu.hainiu1 limit 200;" -B --output_delimiter="\t" -o /data/output.txt

file

查询语句

查询语句与hive基本一致

-- 在排序语句中使用offset 即从offset位置开始输出
select * from xinniu.hainiu1 order by pk desc limit 10 offset 2;

file

union与union all

-- union去重 union all不去重

-- 查询hainiu1表，创建并插入到hainiu2表中
create table if not exists xinniu.hainiu2 
as
select * from xinniu.hainiu1;

-- union
create table if not exists xinniu.uniontable
as
select * from xinniu.hainiu1
union 
select * from xinniu.hainiu2;
-- 验证uniontable表数量
select count(1) from xinniu.uniontable;
-- union all
create table if not exists xinniu.unionalltable
as
select * from xinniu.hainiu1
union all
select * from xinniu.hainiu2;

file

日期类型

-- hive parquet格式不支持date类型，orc格式支持date类型
create table if not exists xinniu.testdatetype4hive(
    pk string,
    col2 date
)
comment 'test date type'
stored as orc
tblproperties ("orc.compress"="SNAPPY");
-- impala impala不支持date类型，什么格式都不支持date类型,统一使用timestamp
create table if not exists testdatetype4impala(
    pk string,
    col2 timestamp
)
comment 'test date type'
stored as parquet
tblproperties ("parquet.compress"="SNAPPY");
-- 时间转换
select current_timestamp();
select from_unixtime(unix_timestamp(current_timestamp()));
select unix_timestamp(current_timestamp());
select typeof(unix_timestamp(current_timestamp()));
select typeof("yyyy-MM-dd HH");
select from_timestamp(cast(unix_timestamp(current_timestamp()) as timestamp),"yyyy-MM-dd");

file

转码函数

-- 加密
select base64encode('hainiu');
-- 解密
select base64decode('aGFpbml1');

file

拼接字符串

-- concat和hive没有区别，但是concat_ws没有hive强大，在impala中concat_ws(),有null出现则结果就为null
select concat('hello','hainiu');
select concat_ws('_','hello','hainiu');
select concat('hello','hainiu',null);
select concat_ws('_','hello','hainiu',null);

file

字符串查找

-- 查找bc在abcdefg中第一次出现的位置(以1起点)
select instr('abcdefg','bc');

file

元数据同步

-- 在impala中执行ddl语句之后，并非所有impalad都会立刻感知到，有三种方式可以解决：
-- 在ddl语句前开启sync_ddl参数，在ddl语句结束后关闭，当前session有效，优先使用
-- 使用refresh db.tablename 表级增量刷新，第二推荐
-- 使用invalidate metadata 全量刷新，使所有impalad上缓存的元数据无效，尽量少用或者不用，一般生产上不允许使用，即使非要用也是invalidate metadata tablename的方式使用
set SYNC_DDL=true;
create table xinniu.synctable(id string);
set SYNC_DDL=false;
refresh xinniu.synctable;
invalidate metadata xinniu.synctable;

file

解决中文注释乱码问题

-- 进入mysql元库查看建表语句
show create table metastore;
-- 修改数据编码为latin1
alter database metastore default character set latin1;
-- 修改表、列、分区、分区键、索引编码
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
alter table PARTITION_PARAMS  modify column PARAM_VALUE varchar(4000) character set utf8;
alter table PARTITION_KEYS  modify column PKEY_COMMENT varchar(4000) character set utf8;
alter table  INDEX_PARAMS  modify column PARAM_VALUE  varchar(4000) character set utf8;

file

impala性能相关

impala关联操作左大右小，如果没有按照左大右小的规则写，impala会按照表统计信息进行优化，但是这么做是依赖历史表统计信息进行优化的，所以在遇到性能非常差的场景时可以从新刷一下表统计信息。

表统计信息的目的是叫plainner知道每张表的一些统计信息，如表大小，表结构等，便于在生成执行计划树是准确的并行下发执行计划树。

-- 刷新全表统计信息
compute stats db.tablename;
-- 执行完dml语句时针对分区刷新表统计信息,此种方式只刷新了batch_date这个新增分区的信息到表统计信息中，相比compute stats效率要快
compute incremental stats db.tablename partition (pt in ($batch_date,''));

在执行大批量任务的时候通常会有非常多任务脚本，在任务提交的时候应该轮询发送到不同的imapad节点上进行，而不应该在同一个impalad节点上执行，会导致单一impalad节点的oom，也严重影响并发性能，在执行的过程中通过-i指定impalad的节点。

impala-shell -k -i worker-1(impalad节点) -q '查询语句' 或者 -f 'sql文件'

impala执行计划解读

判断是否有表或者列统计信息

explain select straight_join t1.*  from (SELECT pk,col1,col2 from xinniu.hainiu1) t1
join
(select pk,col1,col2 from xinniu.hainiu2 limit 10) t2
on t1.pk=t2.pk;

file

在hue上可以查看执行计划树，并且可以直观的看到每个环节所用的时间。

file

创建测试表并加载数据

数据导出

查询语句

union与union all

日期类型

转码函数

拼接字符串

字符串查找

元数据同步

解决中文注释乱码问题

判断是否有表或者列统计信息

作者：青牛

青牛的其他话题

分类下其他主题

随机推荐话题

impala 实操

创建测试表并加载数据

数据导出

查询语句

union与union all

日期类型

转码函数

拼接字符串

字符串查找

元数据同步

解决中文注释乱码问题

判断是否有表或者列统计信息

添加附言

作者：青牛

青牛 的其他话题

分类下其他主题

随机推荐话题

青牛的其他话题