Big Data

Get Involved. Join the Conversation.

Topic

    candan pehlivanoglu
    How should we gather external table stats on bda via bigdata...
    Topic posted January 2, 2019 by candan pehlivanogluGreen Ribbon: 100+ Points, tagged Analytics, Big Data Appliance, Big Data SQL, Data Management, Hadoop 
    229 Views, 1 Comment
    Title:
    How should we gather external table stats on bda via bigdata sql.
    Summary:
    Gather stats for external tables.
    Content:

    Hello all;

    We use bds for our external tables. We have daily jobs  to ingest these tables on big data and nealry 200 tables. These tables are partitioned on hive. But, on oracle site, since there isn't any partition column, we should gather all statistic every day. It is taking too much time. Do you have any other method for this operation. Since we use these tables on oracle, Oracle should be able to know statistic information.

     

    thank you.

     

    Comment

     

    • Alexey Filanovskiy

      thank you for a good question.

      Oracle RDBMS 12.2 allows you to create partitioned external table and you could use this feature to achieve your goal.

      let me show how you could have incremental statistics for partitioned Hive table with Big Data SQL.

      Note1: this example based on the TCP/DS dataset.

      Note2: for simplify explanation I've used in the beginning of scripts, listed below "hive" for statements which should be run in hive and "SQL" for Oracle RDBMS part.

      1) create partitioned table in Hive:

      ----------------------------------------------------------------------

      hive> drop table parq.store_sales_part;

       CREATE TABLE parq.store_sales_part(

        ss_sold_date_sk bigint, 

        ss_sold_time_sk bigint, 

        ss_item_sk bigint, 

        ss_customer_sk bigint, 

        ss_cdemo_sk bigint,

        ss_hdemo_sk bigint, 

        ss_addr_sk bigint, 

        ss_store_sk bigint,

        ss_promo_sk bigint, 

        ss_ticket_number bigint,

        ss_quantity int, 

        ss_wholesale_cost decimal(7,2), 

        ss_list_price decimal(7,2),

        ss_sales_price decimal(7,2), 

        ss_ext_discount_amt decimal(7,2),

        ss_ext_sales_price decimal(7,2),

        ss_ext_wholesale_cost decimal(7,2), 

        ss_ext_list_price decimal(7,2),

        ss_ext_tax decimal(7,2), 

        ss_coupon_amt decimal(7,2),

        ss_net_paid decimal(7,2), 

        ss_net_paid_inc_tax decimal(7,2),

        ss_net_profit decimal(7,2))

        partitioned by (

        year INT,

        month INT,

        day INT)

      stored as parquetfile;

      ----------------------------------------------------------------------

      2) load some data to this table (into 2 partitions):

      ----------------------------------------------------------------------

      hive> SET hive.exec.dynamic.partition=true;

       SET hive.exec.dynamic.partition.mode=nonstrict;

       SET hive.exec.max.dynamic.partitions =10000;

       INSERT INTO TABLE parq.store_sales_part PARTITION (year, month, day)

       SELECT store_sales.*, dt.d_year, dt.d_moy, dt.d_dom

      FROM  parq.store_sales, parq.date_dim dt

      WHERE

      dt.d_date_sk = store_sales.ss_sold_date_sk

      and d_year=2003

      and d_moy=01

      and (d_dom=01 or d_dom=02);

      ----------------------------------------------------------------------

      3) Obtain DDL for External table and Create external table in Oracle RDBMS:

      ----------------------------------------------------------------------

      SQL> set serveroutput on;

      DECLARE

         DDLtxt CLOB;

      BEGIN

         dbms_hadoop.create_extddl_for_hive(

           CLUSTER_ID=>'pmcluster',

           DB_NAME=>'parq',

           HIVE_TABLE_NAME=>'store_sales_part',

           HIVE_PARTITION=>TRUE,

           TABLE_NAME=>'ora_store_sales_part',

           PERFORM_DDL=>FALSE,

           TEXT_OF_DDL=>DDLtxt

         );

         dbms_output.put_line(DDLtxt);

      END;

      /

       

      SQL> CREATE TABLE "BDSQL"."ORA_STORE_SALES_PART" (ss_sold_date_sk NUMBER(19), ss_sold_time_sk NUMBER(19), ss_item_sk NUMBER(19), ss_customer_sk NUMBER(19), ss_cdemo_sk NUMBER(19), ss_hdemo_sk NUMBER(19), ss_addr_sk NUMBER(19), ss_store_sk NUMBER(19), ss_promo_sk NUMBER(19), ss_ticket_number NUMBER(19), ss_quantity NUMBER(10), ss_wholesale_cost NUMBER(7,2), ss_list_price NUMBER(7,2), ss_sales_price NUMBER(7,2), ss_ext_discount_amt NUMBER(7,2), ss_ext_sales_price NUMBER(7,2), ss_ext_wholesale_cost NUMBER(7,2), ss_ext_list_price NUMBER(7,2), ss_ext_tax NUMBER(7,2), ss_coupon_amt NUMBER(7,2),

      ss_net_paid NUMBER(7,2), ss_net_paid_inc_tax NUMBER(7,2), ss_net_profit NUMBER(7,2), year NUMBER(10), month NUMBER(10), day NUMBER(10))  ORGANIZATION EXTERNAL

       (TYPE ORACLE_HIVE

       DEFAULT DIRECTORY DEFAULT_DIR

       ACCESS PARAMETERS (

      com.oracle.bigdata.cluster=pmcluster

      com.oracle.bigdata.tablename=parq.store_sales_part)

      ) REJECT LIMIT UNLIMITED

      PARTITION BY LIST (year, month, day)

      (

      PARTITION "P_484070065" VALUES (2003, 1, 1),

      PARTITION "P_288436660" VALUES (2003, 1, 2)

      )

      ----------------------------------------------------------------------
      4) Gather Statistics for this newly created table:
      ----------------------------------------------------------------------

      SQL> begin

          DBMS_STATS.GATHER_TABLE_STATS (ownname => 'BDSQL',

                                         tabname => 'ORA_STORE_SALES_PART');

      end;

      ----------------------------------------------------------------------
      5) Verify gathering statistics:
      ----------------------------------------------------------------------

      SQL> select PARTITION_NAME, LAST_ANALYZED from ALL_TAB_STATISTICS

      where

      owner=upper('BDSQL')

      and table_name='ORA_STORE_SALES_PART';

       

      PARTITION_NAME                                                             LAST_ANALYZED

      ------------------------------------------------------------------------ -------------------

      P_288436660                                                             02-05-2019 00:13:40

      P_484070065                                                             02-05-2019 00:13:40

      ----------------------------------------------------------------------

      6) set up property for using incremental statistics(for more details check here):

      ----------------------------------------------------------------------

      SQL> begin

      DBMS_STATS.SET_TABLE_PREFS('BDSQL','ORA_STORE_SALES_PART','INCREMENTAL','TRUE');

      end;

      ----------------------------------------------------------------------

      7) add a partition to hive table:

      ----------------------------------------------------------------------

      hive> SET hive.exec.dynamic.partition=true;

       SET hive.exec.dynamic.partition.mode=nonstrict;

       SET hive.exec.max.dynamic.partitions =10000;

       INSERT INTO TABLE parq.store_sales_part PARTITION (year, month, day)

       SELECT store_sales.*, dt.d_year, dt.d_moy, dt.d_dom

       FROM  parq.store_sales, parq.date_dim dt

      WHERE

      dt.d_date_sk = store_sales.ss_sold_date_sk

      and d_year=2003

      and d_moy=01

      and d_dom=03;

      ----------------------------------------------------------------------

      8) add corresponding partition to the Oracle table:

      ----------------------------------------------------------------------

      SQL> alter table BDSQL.ORA_STORE_SALES_PART add partition "NEW_PARTITION2" VALUES (2003, 1, 4);

      ----------------------------------------------------------------------
      9) make sure that there is no statistics for this newly added partition:
      ----------------------------------------------------------------------

      SQL> select PARTITION_NAME, LAST_ANALYZED from ALL_TAB_STATISTICS

      where

      owner=upper('BDSQL')

      and table_name='ORA_STORE_SALES_PART';

       

      PARTITION_NAME                                                             LAST_ANALYZED

      ------------------------------------------------------------------------ -------------------

      P_288436660                                                             02-05-2019 00:13:40

      P_484070065                                                             02-05-2019 00:13:40

      NEW_PARTITION

      ----------------------------------------------------------------------

      10) run procedure for gathering statistics (it may happen automatically during your maintain window):

      ----------------------------------------------------------------------

      SQL> begin

          DBMS_STATS.GATHER_TABLE_STATS (ownname => 'BDSQL',

                                         tabname => 'ORA_STORE_SALES_PART');

      end;

      ----------------------------------------------------------------------

      11) validate that LAST_ANALIZED changed only for last partition:

      ----------------------------------------------------------------------

      SQL> select PARTITION_NAME, LAST_ANALYZED from ALL_TAB_STATISTICS

      where

      owner=upper('BDSQL')

      and table_name='ORA_STORE_SALES_PART';

       

      PARTITION_NAME                                                             LAST_ANALYZED

      ------------------------------------------------------------------------ -------------------

      P_288436660                                                             02-05-2019 00:13:40

      P_484070065                                                             02-05-2019 00:13:40

      NEW_PARTITION                                                           02-05-2019 00:24:26

      ----------------------------------------------------------------------
      this output shows that procedure of gathering statistics don't touch the previously analyzed partitions and only touch the new one.
      Hope it's something that you want to achieve.