201601 - JasonWayne/personal-wiki GitHub Wiki
2016年1月Code snippets
2016-01-05 18:23:08
# Reduce a full player URL stored in `playerid` to the bare player id, e.g.
#   'http://www.basketball-reference.com/players/b/bryanko01.html' --> 'bryanko01'
# The inner SUBSTRING_INDEX(..., '/', -1) keeps the text after the last '/';
# the outer SUBSTRING_INDEX(..., '.', 1) keeps the text before the first '.'.
# No WHERE clause on purpose: every row is rewritten, and a value that contains
# neither delimiter comes back unchanged.
UPDATE `sports_ods_bb_nba_crw01_player_s_salaries`
SET `playerid` = SUBSTRING_INDEX(
        SUBSTRING_INDEX(`playerid`, '/', -1),
        '.',
        1
    );
2016-01-06 14:49:47
# Add the `simplayerid` column, initially placed right after `player`.
ALTER TABLE `sports_ods_bb_nba_crw01_player_sim_career`
    ADD COLUMN `simplayerid` VARCHAR(255) AFTER `player`;

# Move `simplayerid` so that it follows `playerid`; its name and type stay the same.
ALTER TABLE `sports_ods_bb_nba_crw01_player_sim_career`
    MODIFY COLUMN `simplayerid` VARCHAR(255) AFTER `playerid`;
# Create an empty table with the same column layout as the sim-career table:
# the always-false predicate makes the SELECT return no rows.
CREATE TABLE `sports_ods_bb_nba_crw01_player_sim_thru` AS
SELECT *
FROM `sports_ods_bb_nba_crw01_player_sim_career`
WHERE FALSE;
注意:-F 的分隔符和 awk 程序本身都要用单引号括起来。
# Print every line of test.txt that does not have exactly 2 tab-separated fields,
# followed by the number of fields it actually has.
awk -F'\t' 'NF != 2 { print $0, NF }' test.txt
2016-01-08 15:23:19
# Attach theme_name to every `t_themeid_cmsid` row with dt >= '20160104'.
# The outer join keeps rows whose theme_id has no match in `t_theme`
# (theme_name is NULL for those rows).
# TEMPORARY makes the table private to this session and drops it on disconnect.
CREATE TEMPORARY TABLE `temp_theme_20160106` AS
SELECT
    b.theme_name,
    a.*
FROM `t_themeid_cmsid` AS a
LEFT OUTER JOIN `t_theme` AS b
    ON a.theme_id = b.theme_id
WHERE a.dt >= '20160104';
# For each `temp_whole_network_huati` row with dt >= '20160104', attach the
# theme id and name looked up by news_id in `temp_theme_20160106`.
# Rows without a matching news_id are kept with NULL theme columns.
CREATE TABLE `huati_theme_robot_20160104_0106_2` AS
SELECT
    theme.theme_id,
    theme.theme_name,
    huati.*
FROM `temp_whole_network_huati` AS huati
LEFT JOIN `temp_theme_20160106` AS theme
    ON huati.news_id = theme.news_id
WHERE huati.dt >= '20160104';
# 怎样可以写成一句话搞定它
# Two-column staging table for the tab-separated export loaded below.
# NOTE(review): `thema_name` looks like a typo for `theme_name`; kept as-is
# because other queries may already reference this column name.
CREATE TABLE `temp_newsid_themename_0104_0106` (
    `newsid`     VARCHAR(255),
    `thema_name` VARCHAR(255)
);

# Bulk-load the client-side file, splitting each line into fields on TAB.
# NOTE(review): the mysql client may not expand '~' the way a shell does;
# switch to an absolute path if the file is reported as not found.
LOAD DATA LOCAL INFILE '~/Downloads/0104_0106.txt'
INTO TABLE `temp_newsid_themename_0104_0106`
FIELDS TERMINATED BY '\t';
# Connect to the database.
# -A (--no-auto-rehash) skips preloading table/column names for tab completion,
# which makes `use <database>` much faster.
# A bare -p makes the client prompt for the password, so the secret is not kept
# in shell history or shown in the process list, and any shell metacharacters
# it contains need no quoting.
mysql -h 10.210.208.48 -P 3306 -u suda -p -A
2016-01-09 14:29:16
change file delimiter by awk
利用awk改变文件分隔符
# Rewrite filter2.txt into out.txt with \001 instead of TAB as the field separator.
# Assigning $1 to itself forces awk to rebuild $0 using OFS before printing.
awk 'BEGIN { FS = "\t"; OFS = "\001" } { $1 = $1; print }' filter2.txt > out.txt
简单方式:
# One-line form: change the field separator of `file` from TAB to \001.
# The shell needs straight quotes (typographic quotes are passed through literally).
# The trailing `1` prints every record; using `$1=$1` itself as the pattern would
# silently drop records whose first field is empty or 0.
awk '{ $1 = $1 } 1' FS='\t' OFS='\001' file
这里的 $1=$1 是对字段做一次赋值,迫使 awk 用新的 OFS 重新拼接 $0,从而使新的分隔符生效。
2016-01-11 11:18:24
import random


def average_subset_xor(population, experiment_times):
    """Estimate the mean XOR of a randomly sized random sample of ``population``.

    Each trial draws a sample size uniformly from 0..len(population), picks
    that many distinct elements, and XORs them together (an empty sample
    XORs to 0).

    Args:
        population: sequence of integers to sample from.
        experiment_times: number of trials to run; must be positive.

    Returns:
        The average XOR result over all trials, as a float.
    """
    total = 0
    for _ in range(experiment_times):
        # The outcome of this problem depends on sample_size, because
        # sample_size determines the probability that each number shows up
        # in the drawn sample. (Pin it to a constant, e.g. 50, to compare.)
        sample_size = random.randint(0, len(population))
        # This doubly random sampling gives every number a probability of
        # 1/2 of being in the final sample, since the average sample length
        # is half the population length -- so the formula from Zou Bo's
        # slides applies.
        samples = random.sample(population, sample_size)
        result = 0
        for value in samples:
            result ^= value
        total += result
    # float() keeps the fractional part on Python 2, where int / int truncates.
    return float(total) / experiment_times


if __name__ == "__main__":
    # Population: the integers 1..99 plus 2016 (100 values in total).
    l = list(range(1, 100))
    l.append(2016)
    print(average_subset_xor(l, 100000))
python sample的代码阅读
def sample(self, population, k):
    """Chooses k unique random elements from a population sequence.

    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).

    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.

    To choose a sample in a range of integers, use xrange as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(xrange(10000000), 60)

    Raises ValueError when k is negative or larger than len(population).
    """

    # Sampling without replacement entails tracking either potential
    # selections (the pool) in a list or previous selections in a set.

    # When the number of selections is small compared to the
    # population, then tracking selections is efficient, requiring
    # only a small set and an occasional reselection.  For
    # a larger number of selections, the pool tracking method is
    # preferred since the list takes less space than the
    # set and it doesn't suffer from frequent reselections.

    n = len(population)
    if not 0 <= k <= n:
        raise ValueError("sample larger than population")
    # Bind frequently used callables to local names for the loops below.
    random = self.random
    _int = int
    result = [None] * k
    setsize = 21        # size of a small set minus size of an empty list
    if k > 5:
        setsize += 4 ** _ceil(_log(k * 3, 4)) # table size for big sets
    if n <= setsize or hasattr(population, "keys"):
        # An n-length list is smaller than a k-length set, or this is a
        # mapping type so the other algorithm wouldn't work.
        pool = list(population)
        for i in xrange(k):         # invariant:  non-selected at [0,n-i)
            j = _int(random() * (n-i))
            result[i] = pool[j]
            pool[j] = pool[n-i-1]   # move non-selected item into vacancy
    else:
        try:
            selected = set()
            selected_add = selected.add
            for i in xrange(k):
                j = _int(random() * n)
                # Redraw until an index that has not been used yet comes up.
                while j in selected:
                    j = _int(random() * n)
                selected_add(j)
                result[i] = population[j]
        except (TypeError, KeyError):   # handle (at least) sets
            if isinstance(population, list):
                raise
            # Not indexable: retry on an indexable copy of the population.
            return self.sample(tuple(population), k)
    return result