Query-to-Distribution/util.py at master · chunwangpro/Query-to-Distribution · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import pandas as pd
from tqdm import tqdm

OPS = {
    ">": np.greater,
    "<": np.less,
    ">=": np.greater_equal,
    "<=": np.less_equal,
    "=": np.equal,
}


def generate_random_query(table, args, rng):
    """Generate a random query."""
    conditions = rng.randint(args.min_conditions, args.max_conditions + 1)
    if args.model == "1-input":
        ops = rng.choice(["<="], replace=True, size=conditions)
    elif args.model == "2-input":
        # ops = rng.choice(["<", "<=", ">", ">=", "="], replace=True, size=conditions)
        ops = rng.choice(["<="], replace=True, size=conditions)
        # ops = rng.choice(["="], replace=True, size=conditions)
    idxs = rng.choice(table.shape[1], replace=False, size=conditions)
    idxs = np.sort(idxs)
    cols = table[:, idxs]
    vals = table[rng.randint(0, table.shape[0]), idxs]
    card = calculate_query_cardinality(cols, ops, vals)
    return idxs, ops, vals, card


def column_intervalization(query_set, table_size):
    column_interval = {i: set() for i in range(table_size[1])}
    for query in query_set:
        idxs, _, vals, _ = query
        for i in range(len(idxs)):
            column_interval[idxs[i]].add(vals[i])
    # Apply the column_interval to <, <=, >, >=, =
    for k, v in column_interval.items():
        if not v:
            # use [0] for empty column interval
            column_interval[k] = [0, 1]
        else:
            interval_list = sorted(list(v))
            add_small = 2 * interval_list[0] - interval_list[1]
            add_big_1 = 2 * interval_list[-1] - interval_list[-2]
            add_big_2 = 3 * interval_list[-1] - 2 * interval_list[-2]
            column_interval[k] = [add_small] + interval_list + [add_big_1, add_big_2]
    return column_interval


def count_column_unique_interval(unique_intervals):
    # count unique query interval number for each column
    return [len(v) for v in unique_intervals.values()]


def calculate_query_cardinality(data, ops, vals):
    """
    Use ops and vals as queries to find the number of rows in data that meet the conditions.

    Parameters:
    data (2D-array): The subset of table columns involved in the query. Table columns not involved in the query are not included in data.
    ops (1D-array): A list of operators, support operators: '>', '>=', '<', '<=', '='.
    vals (1D-array): A list of values.

    Returns:
    int: The cardinality (number of rows) that satisfy the query.

    Example:
    for empty table, use np.empty((n_row, n_col), dtype=np.float32), return 0.

    for non-empty table:
    table = np.array([[1, 2, 3, 4, 5],
                      [10, 20, 30, 40, 50],
                      [10, 20, 30, 40, 50]]).T
    data = table[:, [1, 2]]
    ops = [">=", ">="]
    vals = [20, 20]
    result = calculate_query_cardinality(data, ops, vals)
    print(result)
    """

    # assert data.shape[1] == len(ops) == len(vals)
    bools = np.ones(data.shape[0], dtype=bool)
    for i, (o, v) in enumerate(zip(ops, vals)):
        bools &= OPS[o](data[:, i], v)
    return bools.sum()


def calculate_Q_error(dataNew, query_set, table_size=None):
    print("\nBegin Calculating Q-error ...")
    Q_error = []
    for query in tqdm(query_set):
        idxs, ops, vals, card_true = query
        card_pred = calculate_query_cardinality(dataNew[:, idxs], ops, vals)
        if card_pred == 0 and card_true == 0:
            Q_error.append(1)
        elif card_pred == 0:
            Q_error.append(card_true)
        elif card_true == 0:
            Q_error.append(card_pred)
        else:
            Q_error.append(max(card_pred / card_true, card_true / card_pred))
    print("Done.\n")
    return Q_error


def print_Q_error(Q_error, args, resultsPath):
    print("Summary of Q-error:")
    print(args)
    statistics = {
        "min": np.min(Q_error),
        "50": np.median(Q_error),
        "75": np.percentile(Q_error, 75),
        "90": np.percentile(Q_error, 90),
        "99": np.percentile(Q_error, 99),
        "max": np.max(Q_error),
        "mean": np.mean(Q_error),
    }
    df = pd.DataFrame.from_dict(statistics, orient="index", columns=["Q-error"])
    df.index.name = None
    df.to_csv(f"{resultsPath}/Q_error.csv", index=True, header=False)
    print(df)