How to use jupyter nbconvert
最近在使用jupyter notebook的时候,发现notebook文件在问题探索方面非常方便,但是交付的话,还是期望能将其转换为python源文件。要实现notebook源文件(.ipynb)与python源文件(.py)之间的相互转换,可以使用命令jupyter nbconvert
来完成。举例如下:
这里有一个文件,名称为Chap14.ipynb,内容如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat Chap14.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyalink.alink import *n",
"useLocalEnv(1)n",
"n",
"from utils import *n",
"import osn",
"import pandas as pdn",
"n",
"pd.set_option('display.max_colwidth', 1000)n",
"n",
"DATA_DIR = ROOT_DIR + "ctr_avazu" + os.sepn",
"n",
"SCHEMA_STRINGn",
" = "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, "n",
" + "site_category string, app_id string, app_domain string, app_category string, device_id string, "n",
" + "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, "n",
" + "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int"n",
"n",
"CATEGORY_COL_NAMES = [n",
" "C1", "banner_pos", "site_category", "app_domain",n",
" "app_category", "device_type", "device_conn_type",n",
" "site_id", "site_domain", "device_id", "device_model"n",
"]n",
"n",
"NUMERICAL_COL_NAMES = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]n",
"n",
"FEATURE_MODEL_FILE = "feature_model.ak"n",
"INIT_MODEL_FILE = "init_model.ak"n",
"n",
"LABEL_COL_NAME = "click"n",
"VEC_COL_NAME = "vec"n",
"PREDICTION_COL_NAME = "pred"n",
"PRED_DETAIL_COL_NAME = "pred_info"n",
"n",
"NUM_HASH_FEATURES = 30000n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_2n",
"TextSourceBatchOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-small.csv")n",
" .firstN(10)n",
" .print()n",
"n",
"trainBatchData = CsvSourceBatchOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-small.csv")n",
" .setSchemaStr(SCHEMA_STRING);n",
"n",
"trainBatchData.firstN(10).print();n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_3n",
"trainBatchData = CsvSourceBatchOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-small.csv")n",
" .setSchemaStr(SCHEMA_STRING);n",
"n",
"feature_pipeline = Pipeline()n",
" .add(n",
" StandardScaler()n",
" .setSelectedCols(NUMERICAL_COL_NAMES)n",
" )n",
" .add(n",
" FeatureHasher()n",
" .setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)n",
" .setCategoricalCols(CATEGORY_COL_NAMES)n",
" .setOutputCol(VEC_COL_NAME)n",
" .setNumFeatures(NUM_HASH_FEATURES)n",
" );n",
"n",
"if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :n",
" feature_pipelinen",
" .fit(trainBatchData)n",
" .save(DATA_DIR + FEATURE_MODEL_FILE)n",
" BatchOperator.execute()n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_4n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)n",
"n",
"data = CsvSourceStreamOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-ctr-train-8M.csv")n",
" .setSchemaStr(SCHEMA_STRING);n",
"n",
"if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :n",
" trainBatchData = CsvSourceBatchOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-small.csv")n",
" .setSchemaStr(SCHEMA_STRING);n",
"n",
" lr = LogisticRegressionTrainBatchOp()n",
" .setVectorCol(VEC_COL_NAME)n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setWithIntercept(True)n",
" .setMaxIter(10);n",
"n",
" feature_pipelineModeln",
" .transform(trainBatchData)n",
" .link(lr)n",
" .link(n",
" AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)n",
" );n",
" BatchOperator.execute();n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_5 n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);n",
"n",
"initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);n",
"n",
"data = CsvSourceStreamOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-ctr-train-8M.csv")n",
" .setSchemaStr(SCHEMA_STRING)n",
" .setIgnoreFirstLine(True)n",
"n",
"spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);n",
"train_stream_data = feature_pipelineModel.transform(spliter);n",
"test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));n",
"n",
"model = FtrlTrainStreamOp(initModel)n",
" .setVectorCol(VEC_COL_NAME)n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setWithIntercept(True)n",
" .setAlpha(0.1)n",
" .setBeta(0.1)n",
" .setL1(0.01)n",
" .setL2(0.01)n",
" .setTimeInterval(10)n",
" .setVectorSize(NUM_HASH_FEATURES)n",
" .linkFrom(train_stream_data);n",
"n",
"predResult = FtrlPredictStreamOp(initModel)n",
" .setVectorCol(VEC_COL_NAME)n",
" .setPredictionCol(PREDICTION_COL_NAME)n",
" .setReservedCols([LABEL_COL_NAME])n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)n",
" .linkFrom(model, test_stream_data);n",
"n",
"# predResultn",
"# .sample(0.0001)n",
"# .select("'Pred Sample' AS out_type, *")n",
"# .print();n",
"n",
"predResult.print(key="predResult", refreshInterval = 30, maxLimit=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predResultn",
" .link(n",
" EvalBinaryClassStreamOp()n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)n",
" .setTimeInterval(10)n",
" )n",
" .link(n",
" JsonValueStreamOp()n",
" .setSelectedCol("Data")n",
" .setReservedCols(["Statistics"])n",
" .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])n",
" .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])n",
" )n",
" .print(key="evaluation", refreshInterval = 30, maxLimit=20)n",
"# .select("'Eval Metric' AS out_type, *")n",
"# .print();n",
"n",
"StreamOperator.execute();n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#c_6n",
"data = CsvSourceStreamOp()n",
" .setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"n",
" + "data-files/avazu-ctr-train-8M.csv")n",
" .setSchemaStr(SCHEMA_STRING)n",
" .setIgnoreFirstLine(True);n",
"n",
"feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);n",
"n",
"spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);n",
"train_stream_data = feature_pipelineModel.transform(spliter);n",
"test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));n",
"n",
"initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);n",
"n",
"model = FtrlTrainStreamOp(initModel)n",
" .setVectorCol(VEC_COL_NAME)n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setWithIntercept(True)n",
" .setAlpha(0.1)n",
" .setBeta(0.1)n",
" .setL1(0.01)n",
" .setL2(0.01)n",
" .setTimeInterval(10)n",
" .setVectorSize(NUM_HASH_FEATURES)n",
" .linkFrom(train_stream_data);n",
"n",
"model_filter = FtrlModelFilterStreamOp()n",
" .setPositiveLabelValueString("1")n",
" .setVectorCol(VEC_COL_NAME)n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setAccuracyThreshold(0.83)n",
" .setAucThreshold(0.71)n",
" .linkFrom(model, train_stream_data);n",
"n",
"model_filtern",
" .select("'Model' AS out_type, *")n",
" .print();n",
"n",
"predResult = FtrlPredictStreamOp(initModel)n",
" .setVectorCol(VEC_COL_NAME)n",
" .setPredictionCol(PREDICTION_COL_NAME)n",
" .setReservedCols([LABEL_COL_NAME])n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)n",
" .linkFrom(model_filter, test_stream_data);n",
"n",
"predResultn",
" .sample(0.0001)n",
" .select("'Pred Sample' AS out_type, *")n",
" .print();n",
"n",
"predResultn",
" .link(n",
" EvalBinaryClassStreamOp()n",
" .setPositiveLabelValueString("1")n",
" .setLabelCol(LABEL_COL_NAME)n",
" .setPredictionDetailCol(PRED_DETAIL_COL_NAME)n",
" .setTimeInterval(10)n",
" )n",
" .link(n",
" JsonValueStreamOp()n",
" .setSelectedCol("Data")n",
" .setReservedCols(["Statistics"])n",
" .setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])n",
" .setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])n",
" )n",
" .select("'Eval Metric' AS out_type, *")n",
" .print();n",
"n",
"StreamOperator.execute();n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
接下来,我们借助命令jupyter nbconvert
将其转换为.py文件,命令如下:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ mkdir -p python
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ jupyter nbconvert --to python Chap14.ipynb --output-dir python
[NbConvertApp] Converting notebook Chap14.ipynb to python
[NbConvertApp] Writing 7347 bytes to python/Chap14.py
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
我们看到,已经在python目录下生成了文件Chap14.py。
接下来,我们看一下生成的Chap14.py文件的内容:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$ cat python/Chap14.py
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
from pyalink.alink import *
useLocalEnv(1)
from utils import *
import os
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
DATA_DIR = ROOT_DIR + "ctr_avazu" + os.sep
SCHEMA_STRING
= "id string, click string, dt string, C1 string, banner_pos int, site_id string, site_domain string, "
+ "site_category string, app_id string, app_domain string, app_category string, device_id string, "
+ "device_ip string, device_model string, device_type string, device_conn_type string, C14 int, C15 int, "
+ "C16 int, C17 int, C18 int, C19 int, C20 int, C21 int"
CATEGORY_COL_NAMES = [
"C1", "banner_pos", "site_category", "app_domain",
"app_category", "device_type", "device_conn_type",
"site_id", "site_domain", "device_id", "device_model"
]
NUMERICAL_COL_NAMES = ["C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]
FEATURE_MODEL_FILE = "feature_model.ak"
INIT_MODEL_FILE = "init_model.ak"
LABEL_COL_NAME = "click"
VEC_COL_NAME = "vec"
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"
NUM_HASH_FEATURES = 30000
# In[ ]:
#c_2
TextSourceBatchOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")
.firstN(10)
.print()
trainBatchData = CsvSourceBatchOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")
.setSchemaStr(SCHEMA_STRING);
trainBatchData.firstN(10).print();
# In[ ]:
#c_3
trainBatchData = CsvSourceBatchOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")
.setSchemaStr(SCHEMA_STRING);
feature_pipeline = Pipeline()
.add(
StandardScaler()
.setSelectedCols(NUMERICAL_COL_NAMES)
)
.add(
FeatureHasher()
.setSelectedCols(CATEGORY_COL_NAMES + NUMERICAL_COL_NAMES)
.setCategoricalCols(CATEGORY_COL_NAMES)
.setOutputCol(VEC_COL_NAME)
.setNumFeatures(NUM_HASH_FEATURES)
);
if not(os.path.exists(DATA_DIR + FEATURE_MODEL_FILE)) :
feature_pipeline
.fit(trainBatchData)
.save(DATA_DIR + FEATURE_MODEL_FILE)
BatchOperator.execute()
# In[ ]:
#c_4
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE)
data = CsvSourceStreamOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")
.setSchemaStr(SCHEMA_STRING);
if not(os.path.exists(DATA_DIR + INIT_MODEL_FILE)) :
trainBatchData = CsvSourceBatchOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-small.csv")
.setSchemaStr(SCHEMA_STRING);
lr = LogisticRegressionTrainBatchOp()
.setVectorCol(VEC_COL_NAME)
.setLabelCol(LABEL_COL_NAME)
.setWithIntercept(True)
.setMaxIter(10);
feature_pipelineModel
.transform(trainBatchData)
.link(lr)
.link(
AkSinkBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE)
);
BatchOperator.execute();
# In[ ]:
#c_5
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);
initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);
data = CsvSourceStreamOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")
.setSchemaStr(SCHEMA_STRING)
.setIgnoreFirstLine(True)
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));
model = FtrlTrainStreamOp(initModel)
.setVectorCol(VEC_COL_NAME)
.setLabelCol(LABEL_COL_NAME)
.setWithIntercept(True)
.setAlpha(0.1)
.setBeta(0.1)
.setL1(0.01)
.setL2(0.01)
.setTimeInterval(10)
.setVectorSize(NUM_HASH_FEATURES)
.linkFrom(train_stream_data);
predResult = FtrlPredictStreamOp(initModel)
.setVectorCol(VEC_COL_NAME)
.setPredictionCol(PREDICTION_COL_NAME)
.setReservedCols([LABEL_COL_NAME])
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)
.linkFrom(model, test_stream_data);
# predResult
# .sample(0.0001)
# .select("'Pred Sample' AS out_type, *")
# .print();
predResult.print(key="predResult", refreshInterval = 30, maxLimit=20)
# In[ ]:
predResult
.link(
EvalBinaryClassStreamOp()
.setLabelCol(LABEL_COL_NAME)
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)
.setTimeInterval(10)
)
.link(
JsonValueStreamOp()
.setSelectedCol("Data")
.setReservedCols(["Statistics"])
.setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])
.setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
)
.print(key="evaluation", refreshInterval = 30, maxLimit=20)
# .select("'Eval Metric' AS out_type, *")
# .print();
StreamOperator.execute();
# In[ ]:
#c_6
data = CsvSourceStreamOp()
.setFilePath("http://alink-release.oss-cn-beijing.aliyuncs.com/"
+ "data-files/avazu-ctr-train-8M.csv")
.setSchemaStr(SCHEMA_STRING)
.setIgnoreFirstLine(True);
feature_pipelineModel = PipelineModel.load(DATA_DIR + FEATURE_MODEL_FILE);
spliter = SplitStreamOp().setFraction(0.5).linkFrom(data);
train_stream_data = feature_pipelineModel.transform(spliter);
test_stream_data = feature_pipelineModel.transform(spliter.getSideOutput(0));
initModel = AkSourceBatchOp().setFilePath(DATA_DIR + INIT_MODEL_FILE);
model = FtrlTrainStreamOp(initModel)
.setVectorCol(VEC_COL_NAME)
.setLabelCol(LABEL_COL_NAME)
.setWithIntercept(True)
.setAlpha(0.1)
.setBeta(0.1)
.setL1(0.01)
.setL2(0.01)
.setTimeInterval(10)
.setVectorSize(NUM_HASH_FEATURES)
.linkFrom(train_stream_data);
model_filter = FtrlModelFilterStreamOp()
.setPositiveLabelValueString("1")
.setVectorCol(VEC_COL_NAME)
.setLabelCol(LABEL_COL_NAME)
.setAccuracyThreshold(0.83)
.setAucThreshold(0.71)
.linkFrom(model, train_stream_data);
model_filter
.select("'Model' AS out_type, *")
.print();
predResult = FtrlPredictStreamOp(initModel)
.setVectorCol(VEC_COL_NAME)
.setPredictionCol(PREDICTION_COL_NAME)
.setReservedCols([LABEL_COL_NAME])
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)
.linkFrom(model_filter, test_stream_data);
predResult
.sample(0.0001)
.select("'Pred Sample' AS out_type, *")
.print();
predResult
.link(
EvalBinaryClassStreamOp()
.setPositiveLabelValueString("1")
.setLabelCol(LABEL_COL_NAME)
.setPredictionDetailCol(PRED_DETAIL_COL_NAME)
.setTimeInterval(10)
)
.link(
JsonValueStreamOp()
.setSelectedCol("Data")
.setReservedCols(["Statistics"])
.setOutputCols(["Accuracy", "AUC", "ConfusionMatrix"])
.setJsonPath(["$.Accuracy", "$.AUC", "$.ConfusionMatrix"])
)
.select("'Eval Metric' AS out_type, *")
.print();
StreamOperator.execute();
# In[ ]:
(lanzhou) lwk@qwfys:~/Public/project/python/alink_tutorial_python/pyalink$
服务器托管,北京服务器托管,服务器租用 http://www.fwqtg.net
缓存就是内存中的数据,常常来自对数据库查询结果的保存,使用缓存、可以避免频繁的与数据库进行交互,进而提高响应速度一级缓存是sqlSession级别的缓存,在操作数据库时需要构造sqlsession对象,在对象中有一个数据结构(hashmap)用于存储缓存数据,…