import os
import pathlib

import plotly
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# import warnings
# warnings.filterwarnings("ignore")


ROOTDIR = pathlib.Path(os.path.abspath('.'))
IMG_HTML = ROOTDIR / 'img-html'
IMG_PNG = ROOTDIR / 'img-png'
IMG_SVG = ROOTDIR / 'img-svg'
DATA_RAW = ROOTDIR / 'data-raw'
DATA_COOKED = ROOTDIR / 'data-processed'


area_bound = 2500
dis_threshold = 10
macro_base_station = {"coverage": 30, "cost": 10}
micro_base_station = {"coverage": 10, "cost": 1}


weak_grid_data = pd.read_csv(DATA_RAW / '附件1 弱覆盖栅格数据(筛选).csv')
exist_site_data = pd.read_csv(DATA_RAW / '附件2 现网站址坐标(筛选).csv')


# TODO 查看附件1数据
print(weak_grid_data.info())
print(weak_grid_data.describe())
weak_grid_data  # 坐标x，坐标y，业务量

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182807 entries, 0 to 182806
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   x        182807 non-null  int64  
 1   y        182807 non-null  int64  
 2   traffic  182807 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 4.2 MB
None
                   x              y        traffic
count  182807.000000  182807.000000  182807.000000
mean     1395.314528     995.012877      38.599343
std       783.230529     733.291862     336.383875
min         0.000000       0.000000       0.000192
25%       546.000000     348.000000       0.596106
50%      1678.000000     869.000000       3.604328
75%      2048.000000    1453.000000      17.901928
max      2499.000000    2499.000000   47795.011719


# TODO 查看附件2数据
print(exist_site_data.info())
print(exist_site_data.describe())
exist_site_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1474 entries, 0 to 1473
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      1474 non-null   int64
 1   x       1474 non-null   int64
 2   y       1474 non-null   int64
dtypes: int64(3)
memory usage: 34.7 KB
None
                id            x            y
count  1474.000000  1474.000000  1474.000000
mean   4175.835821  1332.259837  1389.444369
std    2371.172197   704.638569   695.610340
min       1.000000     1.000000     1.000000
25%    2129.500000   763.250000   834.000000
50%    4205.000000  1303.500000  1498.500000
75%    6216.750000  1970.000000  1948.750000
max    8286.000000  2499.000000  2499.000000


def check_point(x, y, exist_site_data=exist_site_data):
    """检查某点是否已经存在基站"""
    return x in exist_site_data.loc[:, "x"].values and y in exist_site_data.loc[:, "y"].values
check_point(1585, 898), check_point(818, 2020)  # False, True

(False, True)


def plot_weak_grid_data(data=weak_grid_data):
    """绘制 弱覆盖栅格 的散点图"""
#     import plotly.express as px
    fig = px.scatter(data_frame=data, x="x", y="y")
    fig.update_traces(marker=dict(color='blue', size=1))
    return fig

fig = plot_weak_grid_data()
fig.write_html(IMG_HTML / 'question1-weak_grid.html')
fig.write_image(IMG_PNG / 'question1-weak_grid.png')
fig.write_image(IMG_SVG / 'question1-weak_grid.svg')
fig.show()  # 图很大，不建议使用 notebook 查看，建议使用图片查看器/浏览器查看
del fig


def plot_exist_site_data():
    """绘制 现网站址坐标 的散点图"""
#     import plotly.express as px
    fig = px.scatter(data_frame=exist_site_data, x="x", y="y")
    fig.update_traces(marker=dict(color='red', size=3))
    return fig

fig = plot_exist_site_data()
fig.write_html(IMG_HTML / 'question1-exist_site.html')
fig.write_image(IMG_PNG / 'question1-exist_site.png')
fig.write_image(IMG_SVG / 'question1-exist_site.svg')
fig.show()  # 图很大，不建议使用 notebook 查看，建议使用图片查看器/浏览器查看
del fig


def plot_all_data():
    """绘制 所有数据（弱覆盖栅格、现网站址坐标） 的散点图"""
#     import plotly.express as px
#     import plotly.graph_objs as go
#     from plotly.offline import init_notebook_mode, iplot
#     init_notebook_mode(connected=True)
    trace1 = go.Scatter(
        mode='markers', 
        x=weak_grid_data.loc[:, "x"], y=weak_grid_data.loc[:, "y"],
        marker={"color": "blue", "size": 1}, 
        name='弱覆盖点',
    )
    trace2 = go.Scatter(
        mode='markers', 
        x=exist_site_data.loc[:, "x"], y=exist_site_data.loc[:, "y"],
        marker={"color": "red", "size": 3}, 
        name="现网基站",
    )
    data = [trace1, trace2]
    fig = go.Figure(data=data)
    return fig

fig = plot_all_data()
fig.write_html(IMG_HTML / 'question1-all.html')
fig.write_image(IMG_PNG / 'question1-all.png')
fig.write_image(IMG_SVG / 'question1-all.svg')
# fig.show()  # 图很大，不建议使用 notebook 查看，建议使用图片查看器/浏览器查看
del fig


# TODO 根据单位成本业务量计算每个基站的类型（直接评价）（不使用 jit 即时编译加速计算）（已经弃用）
def out_of_use():
    """已弃用代码，最终使用的代码在后面"""
    import tqdm
    import numpy as np
    import pandas as pd

    exist_site = exist_site_data.copy()
    weak_grid = weak_grid_data.copy()
    coverage = [10, 30]
    cost = [1, 10]

    def is_far(i, j, cover):
        return abs(exist_site.iloc[i, 1] - weak_grid.iloc[j, 0]) > cover or \
                abs(exist_site.iloc[i, 2] - weak_grid.iloc[j, 1]) > cover

    def is_in(i, j, cover):
        return np.sqrt((exist_site.iloc[i, 1] - weak_grid.iloc[j, 0])**2 + \
                        (exist_site.iloc[i, 2] - weak_grid.iloc[j, 1])**2) < cover

    base_site_type = []
    for i in tqdm.trange(len(exist_site)):
        business_cost = []
        for idx, c in enumerate(coverage):
            business_cost_tmp = 0
            for j in range(len(weak_grid)):
                if not is_far(i, j, c) and is_in(i, j, c):
                    business_cost_tmp += weak_grid.iloc[j, 2]
            business_cost_tmp /= cost[idx]
            business_cost.append(business_cost_tmp)
        base_site_type.append(1 if business_cost[0] > business_cost[1] else 2)
    base_site_type


# TODO 根据单位成本业务量计算每个基站的类型（直接评价）（使用 jit 即时编译加速计算）（这块代码只需要运行一次）
# （因为改代码用于处理数据，故只需要运行一次即可）

import numba as nb
from numba import njit, prange, jit
from time import time

import warnings
warnings.filterwarnings("ignore")  # 忽略 jit 的警告

exist_site = np.array(exist_site_data).copy()  # 深拷贝，不改动原始数据
weak_grid = np.array(weak_grid_data).copy()  # 深拷贝，不改动原始数据
business_volume = 0
coverage = [10, 30]  # 微、宏基站的覆盖范围
cost = [1, 10]  # 微、宏基站的成本

def is_far(i, j, cover, exist_site=exist_site, weak_grid=weak_grid):
    """判断某个现网站址和某个弱覆盖栅格在 x 方向或者 y 方向的距离是否已经大于基站的覆盖范围
    :param i: 现网站址的索引
    :param j: 弱覆盖栅格的索引
    :param cover: 基站的覆盖范围
    :return: boolean
        使用 numpy
    """
    distance_x = abs(exist_site[i, 1] - weak_grid[j, 0])
    distance_y = abs(exist_site[i, 2] - weak_grid[j, 1])
    return distance_x > cover or distance_y > cover

def is_in(i, j, cover, exist_site=exist_site, weak_grid=weak_grid):
    """判断某个现网站址和某个弱覆盖栅格的距离是否已经大于基站的覆盖范围
    :param i: 现网站址的索引
    :param j: 弱覆盖栅格的索引
    :param cover: 基站的覆盖范围
    :return: boolean
        使用 numpy
    """
    distance = np.sqrt((exist_site[i, 1] - weak_grid[j, 0])**2 + (exist_site[i, 2] - weak_grid[j, 1])**2)
#     distance = ((exist_site[i, 1] - weak_grid[j, 0])**2 + (exist_site[i, 2] - weak_grid[j, 1])**2)**0.5
    return distance < cover

# @njit(parallel=True)
# @jit(nopython=True)
# @njit
@jit  # 使用 jit 即时编译加速，用时大约 1h（不使用 jit 即使编译加速，用时大约 6h）
def cal_base_site_type():
    """根据 单位成本业务量(业务量/成本) 计算并返回 每个基站的类型
        如果微基站的单位成本业务量高，那么就选微基站，即类型为 1
        如果宏基站的单位成本业务量高，那么就选宏基站，即类型为 2
    :return: list 按照 id 顺序的每个基站的类型
    """
    t00 = time()
    l = len(exist_site)
    base_site_type = []
    for i in prange(l):  # numba: prange 并行计算，提高计算速度
        t0 = time()
        business_cost = []
        flag = 0
        for c in coverage:  # [10, 30]
            business_cost_tmp = 0
            for j in prange(len(weak_grid)):
                if not is_far(i, j, c) and is_in(i, j, c):
                    # trick: 先使用计算简单的，再使用计算复杂的，提高计算速度
                    business_cost_tmp += weak_grid[j, 2]
            business_cost_tmp /= cost[flag]
            flag += 1
            business_cost.append(business_cost_tmp)
        # business_cost[0] 微基站的单位成本业务量大，类型为1
        # business_cost[1] 宏基站的单位成本业务量大，类型为2
        base_site_type.append(1 if business_cost[0] > business_cost[1] else 2)
        if i % 100 == 0:  # jit 里面不太兼容 tqdm，所以手动打印查看进度
            t1 = time()
            _stars = '[' + '*' * (i // 100) + '.' * (l // 100 - i // 100) + ']'
            print("%4d / %4d" % (i, l), _stars, round(t1 - t0, 2), "s/iter")
    t11 = time()
    print('运行完毕，总用时：', t11 - t00)
    return base_site_type

# todo 获得基站类型的数据
base_site_list = cal_base_site_type()  # 跑一次这个大概要用 1h，已经提供运行完毕的文件，直接读取即可
print(base_site_list[:100])

def save_exist_site_type_data(
    exist_site=exist_site, 
    base_site_list=base_site_list, 
    columns=list(exist_site_data.columns)
):
    """保存并返回现网基站类型数据
    :param exist_site: np.array
    :param base_site_list: list
    :return: exist_site_type (pd.DataFrame)
    """
    exist_site_type = np.hstack((exist_site, np.array(base_site_list)[None, :].T))  # np.array
    exist_site_type_data = pd.DataFrame(exist_site_type, columns=columns + ['type'])  # pd.DataFrame
    
    exist_site_type_data.to_csv(DATA_COOKED / "question1-exist_site_type_data.csv")  # save

    return exist_site_type_data

# 保存读取
exist_site_type = save_exist_site_type_data()
exist_site_type

   0 / 1474 [..............] 3.7 s/iter
 100 / 1474 [*.............] 6.76 s/iter
 200 / 1474 [**............] 5.25 s/iter
 300 / 1474 [***...........] 2.76 s/iter
 400 / 1474 [****..........] 2.71 s/iter
 500 / 1474 [*****.........] 2.71 s/iter
 600 / 1474 [******........] 2.21 s/iter
 700 / 1474 [*******.......] 2.2 s/iter
 800 / 1474 [********......] 2.18 s/iter
 900 / 1474 [*********.....] 2.18 s/iter
1000 / 1474 [**********....] 2.2 s/iter
1100 / 1474 [***********...] 2.18 s/iter
1200 / 1474 [************..] 2.23 s/iter
1300 / 1474 [*************.] 2.17 s/iter
1400 / 1474 [**************] 2.04 s/iter
运行完毕，总用时： 4212.941307544708
[2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2]


# # 保存读取
# exist_site_type = save_exist_site_type_data()
# exist_site_type

# 直接读取
exist_site_type = pd.read_csv(DATA_COOKED / "question1-exist_site_type_data.csv", index_col=0)
exist_site_type


# 展示类型1数据
exist_site_type[exist_site_type.iloc[:, 3] == 1]


# 展示类型2数据
exist_site_type[exist_site_type.iloc[:, 3] == 2]


def plot_diff_exist_site_type(exist_site_type=exist_site_type):
    """绘制、保存两种现网基站散点图"""
    trace_hong = go.Scatter(
        mode="markers", 
        x=exist_site_type[exist_site_type.iloc[:, 3] == 2].iloc[:, 1], 
        y=exist_site_type[exist_site_type.iloc[:, 3] == 2].iloc[:, 2], 
        marker={"color": "orange"},
        name='宏基站', 
    )
    trace_wei = go.Scatter(
        mode="markers", 
        x=exist_site_type[exist_site_type.iloc[:, 3] == 1].iloc[:, 1], 
        y=exist_site_type[exist_site_type.iloc[:, 3] == 1].iloc[:, 2], 
        marker={"color": "yellow"},
        name='微基站', 
    )
    data = [trace_wei, trace_hong]
    layout = go.Layout(title='现网基站的两种类型基站散点图')
    fig = go.Figure(data=data, layout=layout)
    fig.write_html(IMG_HTML / 'question1-exist_site_2types.html')
    fig.write_image(IMG_PNG / 'question1-exist_site_2types.png')
    fig.write_image(IMG_SVG / 'question1-exist_site_2types.svg')
    fig.show()
    return None


plot_diff_exist_site_type()


# TODO 计算出要除去已经被现网点覆盖的弱覆盖点（使用 jit 即时编译加速计算，因为改代码用于处理数据，故只需要运行一次即可）
# 为什么重复代码？
# ①有些参数不一样，查看、修改参数方便
# ②由于 notebook 无法跳转代码，所以把所需的变量、函数写在一个 ceil 里面，方便

import numba as nb
from numba import njit, prange, jit

import warnings
warnings.filterwarnings("ignore")  # 忽略 jit 的警告

exist_site = np.array(exist_site_type).copy()  # 跟上面不同，这里包含类型！！！
weak_grid = np.array(weak_grid_data).copy()
print(f"exist_site.shape: {exist_site.shape}\nweak_grid.shape: {weak_grid.shape}")

to_be_delete = [0 for _ in range(len(weak_grid))]  # 即将被“删除”的弱覆盖点的索引
coverage = [10, 30]
cost = [1, 10]

def is_far(i, j, cover):
    return abs(exist_site[i, 1] - weak_grid[j, 0]) > cover or \
            abs(exist_site[i, 2] - weak_grid[j, 1]) > cover
def is_in(i, j, cover):
    return np.sqrt((exist_site[i, 1] - weak_grid[j, 0])**2 + \
                    (exist_site[i, 2] - weak_grid[j, 1])**2) < cover

@jit  # 使用 jit 即时编译加速，用时大约 17min（不使用 jit 即使编译加速，用时大约 ?h，不知道没试过）
def del_weak_grid():
    """删除已经被现网点覆盖的弱覆盖点
    :return: None
        已经修改全局变量 to_be_delete
    """
    global to_be_delete
    
    t00 = time()
    l = len(exist_site)
    existing_business_volume = 0
    for i in prange(l):
        t0 = time()
        c = coverage[exist_site[i, 3] - 1]
        for j in prange(len(weak_grid)):
            if not is_far(i, j, c) and is_in(i, j, c):
                to_be_delete[j] = 1
                existing_business_volume += weak_grid[j, 2]
        if i % 100 == 0:  # jit 里面不太兼容 tqdm，所以手动打印查看进度
            t1 = time()
            _stars = '[' + '*' * (i // 100) + '.' * (l // 100 - i // 100) + ']'
            print("%4d / %4d" % (i, l), _stars, round(t1 - t0, 2), "s/iter")
    t11 = time()
    print('运行完毕，总用时：', t11 - t00)
    return None

del_weak_grid()  # 跑一次这个大概要用 1h，已经提供运行完毕的文件，直接读取即可

def save_remain_weak_grid_data(
    weak_grid_data=weak_grid_data, 
    to_be_delete=to_be_delete, 
):
    remain_weak_grid = pd.concat([weak_grid_data, pd.DataFrame(to_be_delete)], axis=1)
    remain_weak_grid.columns = list(weak_grid_data.columns) + ['remained']
    
    remain_weak_grid.to_csv(DATA_COOKED / "question1-remain_weak_grid_data.csv")  # save

    return remain_weak_grid

# 保存读取
remain_grid = save_remain_weak_grid_data()
remain_grid

exist_site.shape: (1474, 4)
weak_grid.shape: (182807, 3)
   0 / 1474 [..............] 0.61 s/iter
 100 / 1474 [*.............] 0.61 s/iter
 200 / 1474 [**............] 0.74 s/iter
 300 / 1474 [***...........] 0.59 s/iter
 400 / 1474 [****..........] 0.65 s/iter
 500 / 1474 [*****.........] 0.64 s/iter
 600 / 1474 [******........] 0.61 s/iter
 700 / 1474 [*******.......] 0.61 s/iter
 800 / 1474 [********......] 0.62 s/iter
 900 / 1474 [*********.....] 0.61 s/iter
1000 / 1474 [**********....] 0.61 s/iter
1100 / 1474 [***********...] 0.59 s/iter
1200 / 1474 [************..] 0.61 s/iter
1300 / 1474 [*************.] 0.61 s/iter
1400 / 1474 [**************] 0.61 s/iter
运行完毕，总用时： 920.439487695694


# # 保存读取
# remain_grid = save_remain_weak_grid_data()
# remain_grid

# 文件读取
remain_grid = pd.read_csv(DATA_COOKED / "question1-remain_weak_grid_data.csv", index_col=0)
remain_grid  # delete or not


# TODO 查看ba
cond = remain_grid.iloc[:, 3] == 1  # 删选出
print(f"已覆盖弱站点的业务量占比：{remain_grid[cond].sum()[2] / remain_grid.sum()[2] * 100}%")
total_bussiness = remain_grid.sum()[2]
print(f"总业务量：{total_bussiness}")
covered_business = remain_grid[cond].sum()[2]
print(f"已经覆盖的业务量：{covered_business}")
uncovered_bussiness = total_bussiness - covered_business
print(f"未覆盖的业务量：{uncovered_bussiness}")
need_to_cover_bussiness = total_bussiness * 0.9 - covered_business
print(f"需要覆盖的业务量：{need_to_cover_bussiness}")

已覆盖弱站点的业务量占比：46.345035166180985%
总业务量：7056230.114628
已经覆盖的业务量：3270212.3280309997
未覆盖的业务量：3786017.7865970004
需要覆盖的业务量：3080394.7751342


# todo plot 3 kinds point (suggest using pucharm to plot, opened with HTML)
def plot_diff_grid():
#     import plotly.graph_objs as go
#     from plotly.offline import init_notebook_mode, iplot
#     init_notebook_mode(connected=True)
    
    remain_cond = remain_grid.iloc[:, 3] == 0
    delete_cond = remain_grid.iloc[:, 3] == 1
    trace1 = go.Scatter(
        mode='markers', 
        x=remain_grid[remain_cond].loc[:, 'x'], y=remain_grid[remain_cond].loc[:, 'y'],
        marker={"color": "blue", "size": 1}, 
        name='未覆盖的弱覆盖点', 
    )
    trace2 = go.Scatter(
        mode='markers', 
        x=remain_grid[delete_cond].loc[:, 'x'], y=remain_grid[delete_cond].loc[:, 'y'],
        marker={"color": "orange", "size": 1}, 
        name='已覆盖的弱覆盖点', 
    )
    trace3 = go.Scatter(
        mode='markers',
        x=exist_site_data.loc[:, "x"], y=exist_site_data.loc[:, "y"],
        marker={"color": "red", "size": 2},
        name="现网基站",
    )
    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data)
    return fig
fig = plot_diff_grid()
fig.write_html(IMG_HTML / 'question1-weak_grid(remained)&exist_site.html')
fig.write_image(IMG_PNG / 'question1-weak_grid(remained)&exist_site.png')
fig.write_image(IMG_SVG / 'question1-weak_grid(remained)&exist_site.svg')
# fig.show()  # 图很大，不建议使用 notebook 查看，建议使用图片查看器/浏览器查看
del fig


# TODO 重新读取数据
exist_site_type_data = pd.read_csv(DATA_COOKED / "question1-exist_site_type_data.csv", index_col=0)
remain_weak_grid_data = pd.read_csv(DATA_COOKED / "question1-remain_weak_grid_data.csv", index_col=0)
print(f"exist_site_type_data.shape: {exist_site_type_data.shape}")
print(f"remain_weak_grid_data.shape: {remain_weak_grid_data.shape}")

exist_site_type_data.shape: (1474, 4)
remain_weak_grid_data.shape: (182807, 4)


from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import metrics


# TODO 查看数据
remain_cond = remain_weak_grid_data.iloc[:, 3] == 0
delete_cond = remain_weak_grid_data.iloc[:, 3] == 1
remain_weak_grid_data[remain_cond], remain_weak_grid_data[delete_cond]

(           x     y     traffic  remained
 0         66  1486  140.581390         0
 1         67  1486  140.518829         0
 4        284  1486   71.528404         0
 6       1400  1486   17.549461         0
 7       1418  1486    5.520310         0
 ...      ...   ...         ...       ...
 182802  2350  2123    0.178571         0
 182803  2353  2123    5.159708         0
 182804  2354  2123    5.134017         0
 182805  2355  2123    2.599999         0
 182806  2372  2123   57.814999         0
 
 [136939 rows x 4 columns],
            x     y    traffic  remained
 2        177  1486  48.919178         1
 3        187  1486   4.322495         1
 5        309  1486   0.073663         1
 10      1483  1486   8.716330         1
 11      1554  1486   0.885215         1
 ...      ...   ...        ...       ...
 182786  2003  2123   1.830510         1
 182787  2004  2123   9.870605         1
 182788  2005  2123  12.248288         1
 182789  2202  2123   5.386420         1
 182799  2300  2123  15.452216         1
 
 [45868 rows x 4 columns])


import numpy as np
import pandas as pd
from time import time
from sklearn.cluster import KMeans
from numba import jit, njit, prange
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

class BestSites(object):
    def __init__(self, n_clusters=100):
        self.coverage = [10, 30]
        self.cost = [1, 10]
        self.bussiness = 0
        self.total_bussiness = 7056230.114628
        self.target_bussiness = 3080394.7751342
        self.mod = 200
        
        remain_weak_grid_data = pd.read_csv(DATA_COOKED / "question1-remain_weak_grid_data.csv", index_col=0)
        remain_cond = remain_weak_grid_data.iloc[:, 3] == 0
        self.remain_weak_grid_data = remain_weak_grid_data[remain_cond]
        
        self.cluster_centers_ = None
        self.labels_ = None
        self.remain_weak_grid_labels = None
#         self.n_clusters = n_clusters
        
        self.new_site_point_m2 = None
        
    # 自定义 n_clusters
    @property
    def n_clusters(self):
        return self._n_clusters
    @n_clusters.setter
    def n_clusters(self, n):
        self._n_clusters = n
        return None
    
    # todo 方法 2 从大到小直接选择最优站点（使用该方法）
    @jit
    def chose_best_point(self):
        remain_weak_grid = pd.DataFrame(self.remain_weak_grid_data.copy())
        remain_weak_grid.sort_values(by='traffic', ascending=False, inplace=True, ignore_index=True)
        print(remain_weak_grid)
        remain_weak_grid = np.array(remain_weak_grid)
        done_bussiness = 0
        new_site_point_m2 = []
        for i in prange(len(remain_weak_grid)):
            if done_bussiness >= self.target_bussiness:
                print("total_done_bussiness:", done_bussiness)
                self.new_site_point_m2 = new_site_point_m2
                return pd.DataFrame(new_site_point_m2, columns=["x", "y", 'type'])
            
            if remain_weak_grid[i, 3] == 0:
                remain_weak_grid[i, 3] = 1
                done_bussiness += remain_weak_grid[i, 2]
                x1, y1 = int(round(remain_weak_grid[i, 0])), int(round(remain_weak_grid[i, 1]))
#                 flag = np.random.randint(2)
                flag = 1 if i % 2 == 0 or i % 5 == 0 else 0
                new_site_point_m2.append([x1, y1, flag + 1])
                c = coverage[flag]

                for j in prange(len(remain_weak_grid)):
                    if remain_weak_grid[j, 3] == 0:
                        x2, y2 = remain_weak_grid[j, 0], remain_weak_grid[j, 1]
                        if not self.is_far(x1, y1, x2, y2, c) and self.is_in(x1, y1, x2, y2, c):
                            remain_weak_grid[j, 3] = 1
                            done_bussiness += remain_weak_grid[j, 2]

            if i % 300 == 0:
                print("i:", i, "business:", done_bussiness)
        self.new_site_point_m2 = new_site_point_m2.copy()
        return pd.DataFrame(new_site_point_m2, columns=["x", "y", 'type'])

    # todo 方法 1 使用kmeans聚类选择最优站点（未使用该方法，因为①耗时②选择站点效果不好）
#     @jit
    def clustering_kmeans(self):
        """计算 kmeans 聚类结果"""
        k_means = KMeans(n_clusters=self.n_clusters, random_state=10)
        print(k_means)
        time_begin = time()
        k_means.fit(self.remain_weak_grid_data.iloc[:, :2].values)
        
        self.cluster_centers_ = k_means.cluster_centers_.copy()
        self.labels_ =  k_means.labels_.copy()
        
        data_frame = self.remain_weak_grid_data.copy()
        data_frame.insert(3, "labels", self.labels_)
        data_frame.drop(columns="remained", axis=1, inplace=True)
#         print(data_frame)
        self.remain_weak_grid_labels = np.array(data_frame.copy())
        
        print(f"kmeans cost time: {time() - time_begin}")
        return k_means.cluster_centers_, k_means.labels_
    def plot_kmeans_res(self):
        """绘制 kmeans 聚类结果"""
#         print(self.remain_weak_grid_labels)
        remain_weak_grid_labels = self.remain_weak_grid_labels.copy()
        remain_weak_grid_labels = pd.DataFrame(
            remain_weak_grid_labels, 
            columns=["x", "y", "traffic", "labels"], 
        )
        fig = px.scatter(data_frame=remain_weak_grid_labels, x='x', y='y', color='labels')
        fig.update_traces(marker={"size": 1})
#         fig.show()
        return fig
#     @jit
    def is_far(self, x, y, i, j, c):
        return abs(x - i)> c or abs(y - j) > c
#     @jit
    def is_in(self, x, y, i, j, c):
        return (x - i)**2 + (y - j)**2 < c**2
    @jit
    def confirm_type(self):
        print("确定新建基站类型")
        new_base_site_type = []
        new_base_site = self.cluster_centers_
        remain_weak_grid = np.array(self.remain_weak_grid_labels)
#         print(new_base_site)
#         print(remain_weak_grid)
        for i in prange(len(new_base_site)):
            flag = 0
            bussiness_per_cost = []
            for coverage in self.coverage:
                business_per_cost_tmp = 0
                for j in prange(len(remain_weak_grid)):
                    x1, y1 = new_base_site[i, 0], new_base_site[i, 1]
                    x2, y2 = remain_weak_grid[j, 0], remain_weak_grid[j, 1]
                    c = coverage
                    if not self.is_far(x1, y1, x2, y2, c) and self.is_in(x1, y1, x2, y2, c):
                        business_per_cost_tmp += remain_weak_grid[j, 2]
                business_per_cost_tmp /= self.cost[flag]
                flag += 1
                bussiness_per_cost.append(business_per_cost_tmp)
            new_base_site_type.append(1 if bussiness_per_cost[0] > bussiness_per_cost[1] else 2)
#             if i % self.mod == 0:
#                 print("i:", i, "business:", done_bussiness)
        new_base_site_type = np.array(new_base_site_type)
#         print(new_base_site_type)
#         print(type(new_base_site_type))
#         print(self.cluster_centers_)
#         print(type(self.cluster_centers_))
        self.cluster_centers_ = np.concatenate((
            self.cluster_centers_.transpose(), 
            new_base_site_type.reshape(1, -1)
        )).transpose()
#         print(self.cluster_centers_)
#         return new_base_site_type
#     @jit
    def cal_business(self):
        print("计算新建基站覆盖的业务量")
        self.bussiness = 0
        for i in prange(len(self.cluster_centers_)):
            coverage = self.coverage[int(self.cluster_centers_[i, 2]) - 1]
            for j in prange(len(self.remain_weak_grid_labels)):
                c = coverage
                x1, y1 = self.cluster_centers_[i, 0], self.cluster_centers_[i, 1]
                x2, y2 = self.remain_weak_grid_labels[j, 0], self.remain_weak_grid_labels[j, 1]
                if not self.is_far(x1, y1, x2, y2, c) and self.is_in(x1, y1, x2, y2, c):
                    self.bussiness += self.remain_weak_grid_labels[j, 2]
            if i % self.mod == 0:
                print(i)
        print(f"需要覆盖的业务量：{self.target_bussiness}")
        print(f"新建基站覆盖的业务量：{self.bussiness}")
        print(f"新建 / 需要：{self.bussiness / self.target_bussiness * 100}%")
        print(f"新建 / 总量：{self.bussiness / self.total_bussiness * 100}%")
        return self.bussiness >= self.target_bussiness


# todo method 2: chose best point
bb = BestSites()
bb.chose_best_point()

           x     y       traffic  remained
0        932  2210  18680.960938         0
1        972  1238  15063.482422         0
2       1229  1894  12733.102539         0
3       1311  2005  12311.877930         0
4       1611  2233  11579.516602         0
...      ...   ...           ...       ...
136934  2178  2348      0.000192         0
136935  2217  2402      0.000192         0
136936  2239  2456      0.000192         0
136937  2175  2349      0.000192         0
136938  2170  2344      0.000192         0

[136939 rows x 4 columns]
i: 0 business: 19008.385769000004
i: 300 business: 1243831.1821389985
i: 600 business: 1861172.3909059966
i: 900 business: 2237787.6476250137
i: 1200 business: 2466160.5638240105
i: 1500 business: 2622924.669223021
i: 1800 business: 2761005.805976014
i: 2100 business: 2851370.820233041
i: 2400 business: 2953135.8486840385
i: 2700 business: 3024676.8779580435
total_done_bussiness: 3082428.6270970367


new_site_point_m2 = pd.DataFrame(bb.new_site_point_m2, columns=['x', 'y', 'type'])
new_site_point_m2.to_csv(DATA_COOKED / "question1-new_build_sites.csv")
new_site_point_m2


fig = px.scatter(data_frame=new_site_point_m2, x='x', y='y', color='type')
fig.update_layout(title='新建基站散点图')
fig.update_traces(marker={"size": 5})
fig.write_html(IMG_HTML / 'question1-new_build_sites.html')
fig.write_image(IMG_PNG / 'question1-new_build_sites.png')
fig.write_image(IMG_SVG / 'question1-new_build_sites.svg')
fig.show()
del fig


# todo method 1: kmeans
b = BestSites()
for n in range(100, 101):  # 这里以100为例(用时大约 1.5min)，实际上 1407 - 1419 才达到所要求的 90%，用时 4min
    b.n_clusters = n
    b.clustering_kmeans()    # kmeans
    time1 = time()
    b.confirm_type()         # confirm
    time2 = time()
    print(f"confirm cost time: {time2 - time1}s")
    flag = b.cal_business()  # business
    time3 = time()
    print(f"calculatet cost time: {time3 - time2}s")
    if flag:
        break

KMeans(n_clusters=100, random_state=10)
kmeans cost time: 21.72585892677307
确定新建基站类型
confirm cost time: 43.21766185760498s
计算新建基站覆盖的业务量
0
需要覆盖的业务量：3080394.7751342
新建基站覆盖的业务量：285833.464153
新建 / 需要：9.279117938399548%
新建 / 总量：4.050795672896914%
calculatet cost time: 17.47350525856018s


fig = b.plot_kmeans_res()
fig.update_layout(title='kmeans聚类(k=100)的结果')
fig.update_traces(marker={"size": 3})
fig.show()


new_base_site_type = b.cluster_centers_  # 中心点
remain_weak_grid_labels = b.remain_weak_grid_labels  # 保留的弱覆盖的聚类标签数据

new_base_site_type = pd.DataFrame(new_base_site_type, columns=['x', 'y', 'type'])
fig = px.scatter(data_frame=new_base_site_type, x='x', y='y', color='type')
fig.update_layout(title='新建基站散点图')
fig.update_traces(marker={"size": 10})
fig.show()


# todo plot（done）

trace1 = go.Scatter(mode='markers', x=new_site_point_m2['x'], y=new_site_point_m2['y'], 
                    name='新建基站', marker={"color":"red", "size":5})
trace2 = go.Scatter(mode='markers', x=exist_site_data['x'], y=exist_site_data['y'], 
                    name='现有基站',marker={"color":"pink", "size":5})
trace3 = go.Scatter(mode='markers', x=weak_grid_data['x'], y=weak_grid_data['y'], 
                    name='弱覆盖点',marker={"color":"cyan", "size":1})
data = [trace3, trace2, trace1]
fig = go.Figure(data=data)
fig.write_html(IMG_HTML / "question1-new_exist_weak_3kinds_sites.html")
fig.write_image(IMG_PNG / 'question1-new_exist_weak_3kinds_sites.png')
fig.write_image(IMG_SVG / 'question1-new_exist_weak_3kinds_sites.svg')
# fig.show()  # 不建议在 notebook 查看该图
del fig


kmeans100 = KMeans(n_clusters=100)
kmeans100.fit(weak_grid_data.iloc[:, :2])
pd.DataFrame(kmeans100.cluster_centers_)


weak_grid_data_cp = pd.concat([weak_grid_data.copy(), pd.DataFrame(kmeans100.labels_)], axis=1)
weak_grid_data_cp.rename(inplace=True, columns={0: "labels"})
weak_grid_data_cp


fig = px.scatter(data_frame=weak_grid_data_cp, x='x', y='y', color='labels')
fig.update_traces(marker={"size": 3})
fig.write_html(IMG_HTML / 'question1-kmeans100.html')
fig.write_image(IMG_PNG / 'question1-kmeans100.png')
fig.write_image(IMG_SVG / 'question1-kmeans100.svg')
# fig.show()  # 不建议在 notebook 上运行该行代码
del fig


# todo 这个忘了干啥用的了，好像是查看聚类和新建的数据？
newnew_site_data = pd.DataFrame(kmeans100.cluster_centers_, columns=["x", "y"])
nest_site_data = pd.read_csv(DATA_COOKED / "question1-best_base_site_points.csv", index_col=0)
total_site = pd.concat([newnew_site_data, nest_site_data])
total_site


# todo cal 343 points and sort（done）
from sklearn.cluster import KMeans

kmeans343 = KMeans(n_clusters=343)
kmeans343.fit(weak_grid_data.iloc[:, :2])

weak_grid_data_cp = pd.concat([weak_grid_data.copy(), pd.DataFrame(kmeans343.labels_)], axis=1)
weak_grid_data_cp.rename(inplace=True, columns={0: "labels"})
weak_grid_data_cp

weak_grid_data_cp_size = []  # 343 个数，分别为 343 个类包含的点的个数
for name, group in weak_grid_data_cp.groupby(by='labels'):
    l = len(group)
    weak_grid_data_cp_size.append(l)
# print(len(weak_grid_data_cp_size))

weak_grid_data_cp_size = pd.DataFrame(weak_grid_data_cp_size).sort_values(by=0, ascending=False)
# print(weak_grid_data_cp_size.describe())
weak_grid_data_cp_size.T


weak_grid_data_cp = pd.concat([weak_grid_data.copy(), pd.DataFrame(kmeans343.labels_)], axis=1)
weak_grid_data_cp.rename(inplace=True, columns={0: "labels"})
weak_grid_data_cp


fig = px.scatter(data_frame=weak_grid_data_cp, x='x', y='y', color='labels')
fig.update_traces(marker={"size": 3})
fig.write_html(IMG_HTML / 'question1-kmeans343.html')
fig.write_image(IMG_PNG / 'question1-kmeans343.png')
fig.write_image(IMG_SVG / 'question1-kmeans343.svg')
# fig.show()  # 不建议在 notebook 上运行该行代码
del fig


# todo 这个忘了干啥用的了，好像是查看聚类和新建的数据？
newnew_site_data = pd.DataFrame(kmeans343.cluster_centers_, columns=["x", "y"])
nest_site_data = pd.read_csv(DATA_COOKED / "question1-best_base_site_points.csv", index_col=0)
total_site = pd.concat([newnew_site_data, nest_site_data])
total_site

	id	x	y
0	1	818	2020
1	4	713	2013
2	33	2305	291
3	35	700	1953
4	36	949	2293
...	...	...	...
1469	8254	2324	1625
1470	8257	1135	852
1471	8278	2053	1818
1472	8282	1432	1797
1473	8286	1584	898

	id	x	y	type
6	55	1787	1403	1
9	71	1359	1514	1
13	97	2491	758	1
48	358	2310	224	1
53	415	2104	1824	1
...	...	...	...	...
1448	8164	2143	1289	1
1449	8165	1828	279	1
1452	8188	1666	1230	1
1456	8200	2245	1436	1
1457	8206	1956	245	1

	0	1
0	87.753616	1315.726240
1	1883.493878	708.474212
2	1986.039390	2405.806600
3	1040.561162	262.899083
4	259.168357	2149.452333
...	...	...
95	622.049133	769.579961
96	1154.357724	1824.552846
97	428.804945	1170.425824
98	759.863463	125.238667
99	1555.964988	1408.413084

	x	y	type
0	87.753616	1315.726240	NaN
1	1883.493878	708.474212	NaN
2	1986.039390	2405.806600	NaN
3	1040.561162	262.899083	NaN
4	259.168357	2149.452333	NaN
...	...	...	...
338	709.000000	455.000000	1.0
339	1627.000000	2404.000000	1.0
340	819.000000	564.000000	1.0
341	814.000000	610.000000	1.0
342	1.000000	1814.000000	1.0

	x	y	type
0	1966.755000	2162.870833	NaN
1	508.121160	51.238055	NaN
2	1886.529510	713.724023	NaN
3	248.806950	1057.656371	NaN
4	1412.564994	316.087516	NaN
...	...	...	...
338	709.000000	455.000000	1.0
339	1627.000000	2404.000000	1.0
340	819.000000	564.000000	1.0
341	814.000000	610.000000	1.0
342	1.000000	1814.000000	1.0

问题1¶

读取并观察数据¶

可视化数据¶

求解模型¶

根据单位成本业务量计算每个基站的类型¶

除去已经被现网点覆盖的弱覆盖点¶

聚类/直接选取站点¶

直接选取（使用该方法）¶

聚类选取（未使用该方法，可忽略）¶

绘制三种基站（现有、弱覆盖、新建）散点图¶

其他算法/模型（在分析数据的时候写的）¶

对弱覆盖栅格数据进行 kmeans 聚类(k=100)¶

绘制 kmean 聚类(k=100)图所有点的散点图¶

其他¶

对弱覆盖栅格数据进行 kmeans 聚类(k=343)¶

绘制 kmeans 聚类(k=343)所有点的散点图¶

其他¶

	x	y	traffic
0	66	1486	140.581390
1	67	1486	140.518829
2	177	1486	48.919178
3	187	1486	4.322495
4	284	1486	71.528404
...	...	...	...
182802	2350	2123	0.178571
182803	2353	2123	5.159708
182804	2354	2123	5.134017
182805	2355	2123	2.599999
182806	2372	2123	57.814999

	x	y	type
0	932	2210	2
1	972	1238	1
2	1229	1894	2
3	1311	2005	1
4	1611	2233	2
...	...	...	...
783	1171	812	1
784	250	1065	1
785	2353	24	1
786	2422	583	2
787	2249	1453	2