201601 - JasonWayne/personal-wiki GitHub Wiki

2016年1月Code snippets

2016-01-05 18:23:08

# Reduce a full player URL to its bare player id, e.g.
# 'http://www.basketball-reference.com/players/b/bryanko01.html' --> 'bryanko01'
# The inner call keeps the text after the last '/'; the outer call keeps the
# text before the first '.'.
# NOTE(review): there is no WHERE clause, so every row of the table is
# rewritten -- presumably intended as a one-off cleanup; confirm before rerunning.
UPDATE `sports_ods_bb_nba_crw01_player_s_salaries`
SET `playerid` = SUBSTRING_INDEX(
        SUBSTRING_INDEX(`playerid`, '/', -1),
        '.',
        1
    );

2016-01-06 14:49:47

# Add `simplayerid`, initially placed right after `player`.
ALTER TABLE `sports_ods_bb_nba_crw01_player_sim_career`
    ADD COLUMN `simplayerid` VARCHAR(255) AFTER `player`;

# Reposition `simplayerid` so it follows `playerid` (name and type unchanged).
ALTER TABLE `sports_ods_bb_nba_crw01_player_sim_career`
    MODIFY COLUMN `simplayerid` VARCHAR(255) AFTER `playerid`;

# Create an empty table with the same columns as the career table: the
# always-false predicate copies the column layout without copying any rows.
CREATE TABLE `sports_ods_bb_nba_crw01_player_sim_thru` AS
SELECT *
FROM `sports_ods_bb_nba_crw01_player_sim_career`
WHERE 1 = 2;
注意用单引号。
# Print every line of test.txt that does not have exactly 2 tab-separated
# fields, followed by its actual field count.
awk -F'\t' 'NF != 2 { print $0, NF }' test.txt

2016-01-08 15:23:19

# Rows of `t_themeid_cmsid` with dt >= '20160104' (dt is compared as a string;
# presumably a YYYYMMDD date), each with the theme name attached where
# `t_theme` has a matching theme_id (NULL otherwise).
# NOTE: a TEMPORARY table is visible only to the session that created it, so
# any statement that reads it must run on the same connection.
CREATE TEMPORARY TABLE `temp_theme_20160106` AS
SELECT
    b.theme_name,
    a.*
FROM `t_themeid_cmsid` AS a
LEFT JOIN `t_theme` AS b
    ON a.theme_id = b.theme_id
WHERE a.dt >= '20160104';

# Materialize every row of `temp_whole_network_huati` with dt >= '20160104',
# prefixed with the theme id and name found for its news_id in
# `temp_theme_20160106` (both NULL when there is no match).
CREATE TABLE `huati_theme_robot_20160104_0106_2` AS
SELECT
    b.theme_id,
    b.theme_name,
    a.*
FROM `temp_whole_network_huati` AS a
LEFT JOIN `temp_theme_20160106` AS b
    ON a.news_id = b.news_id
WHERE a.dt >= '20160104';
# Open question: how could this be done in a single statement?
# Two-column staging table that the LOAD DATA statement below fills from a
# local file.
# NOTE(review): `thema_name` looks like a typo for `theme_name`; it is left
# unchanged here because anything that reads this table would have to change
# with it.
CREATE TABLE `temp_newsid_themename_0104_0106` (
    `newsid`     VARCHAR(255),
    `thema_name` VARCHAR(255)
);

# Bulk-load the tab-separated, client-side file into the staging table.
LOAD DATA LOCAL INFILE '~/Downloads/0104_0106.txt'
    INTO TABLE `temp_newsid_themename_0104_0106`
    FIELDS TERMINATED BY '\t';
# Connect to the database.
# -A (--no-auto-rehash) skips loading table and column names for tab
# completion, which makes `use <database>` much faster.
# -p with no value makes the client prompt for the password, so the password
# is not stored in shell history or shown in the process list (and no shell
# metacharacters in it can be expanded).
mysql -h 10.210.208.48 -P 3306 -u suda -p -A

2016-01-09 14:29:16

Change a file's field delimiter with awk
利用awk改变文件分隔符
# Copy filter2.txt to out.txt, replacing the tab field separator with \001.
# Assigning $1 to itself forces awk to rebuild each record using OFS.
awk 'BEGIN { FS = "\t"; OFS = "\001" } { $1 = $1; print }' filter2.txt > out.txt
简单方式:
# Plain ASCII quotes are required by the shell. The action `{ $1 = $1 }`
# followed by the always-true pattern `1` prints every line; using `$1=$1`
# itself as the pattern would skip lines whose first field is empty or 0.
awk '{ $1 = $1 } 1' FS='\t' OFS='\001' file
这里的$1=$1就是运算一下,以使新的分隔符生效

2016-01-11 11:18:24

import random


def mean_sample_xor(population, trials, rng=random):
    """Estimate the expected XOR of a random-sized random sample.

    Each trial draws a sample size uniformly from 0..len(population)
    (inclusive), picks that many distinct elements without replacement and
    XORs them together (an empty sample XORs to 0).

    population -- sequence of ints to sample from
    trials     -- number of trials to average over; must be positive
    rng        -- object providing randint() and sample(); defaults to the
                  random module

    Returns the mean XOR over all trials as a float (true division, so the
    estimate is not truncated to an integer).

    Raises ValueError if trials is not positive.
    """
    if trials <= 0:
        raise ValueError("trials must be positive")

    total = 0
    for _ in range(trials):
        # The outcome depends on how the sample size is chosen, because the
        # sample size determines each value's probability of being sampled.
        # Alternative: use a fixed size instead, e.g. sample_size = 50.
        # Author's note: with this two-level random sampling each value ends
        # up in the sample with probability 1/2 (the average sample length is
        # half the population), so the formula from Zou Bo's slides can be
        # applied.
        sample_size = rng.randint(0, len(population))

        xor_value = 0
        for value in rng.sample(population, sample_size):
            xor_value ^= value

        total += xor_value
    return float(total) / trials


if __name__ == "__main__":
    # The integers 1..99 plus 2016: 100 values in total.
    l = list(range(1, 100))
    l.append(2016)

    experiment_times = 100000

    print(mean_sample_xor(l, experiment_times))

python sample的代码阅读

    def sample(self, population, k):
        """Chooses k unique random elements from a population sequence.

        Returns a new list containing elements from the population while
        leaving the original population unchanged.  The resulting list is
        in selection order so that all sub-slices will also be valid random
        samples.  This allows raffle winners (the sample) to be partitioned
        into grand prize and second place winners (the subslices).

        Members of the population need not be hashable or unique.  If the
        population contains repeats, then each occurrence is a possible
        selection in the sample.

        To choose a sample in a range of integers, use xrange as an argument.
        This is especially fast and space efficient for sampling from a
        large population:   sample(xrange(10000000), 60)

        Raises ValueError if k is negative or larger than len(population).
        """

        # Sampling without replacement entails tracking either potential
        # selections (the pool) in a list or previous selections in a set.

        # When the number of selections is small compared to the
        # population, then tracking selections is efficient, requiring
        # only a small set and an occasional reselection.  For
        # a larger number of selections, the pool tracking method is
        # preferred since the list takes less space than the
        # set and it doesn't suffer from frequent reselections.

        n = len(population)
        # This check also rejects negative k, although the error message only
        # mentions the "too large" case.
        if not 0 <= k <= n:
            raise ValueError("sample larger than population")
        # Local aliases for names used inside the loops below (presumably to
        # avoid repeated attribute/global lookups -- confirm if it matters).
        random = self.random
        _int = int
        # Pre-sized output list; slot i is filled by the i-th selection.
        result = [None] * k
        setsize = 21        # size of a small set minus size of an empty list
        if k > 5:
            # NOTE(review): _ceil and _log are not defined in this excerpt;
            # presumably math.ceil and math.log -- confirm against the module.
            setsize += 4 ** _ceil(_log(k * 3, 4)) # table size for big sets
        if n <= setsize or hasattr(population, "keys"):
            # An n-length list is smaller than a k-length set, or this is a
            # mapping type so the other algorithm wouldn't work.
            # Pool method: copy the population, then for each selection pick a
            # random index among the still-unselected prefix and overwrite the
            # chosen slot with the last unselected item.
            pool = list(population)
            for i in xrange(k):         # invariant:  non-selected at [0,n-i)
                j = _int(random() * (n-i))
                result[i] = pool[j]
                pool[j] = pool[n-i-1]   # move non-selected item into vacancy
        else:
            # Set method: draw indices into the population directly and
            # remember the ones already used; a repeated index is redrawn.
            try:
                selected = set()
                selected_add = selected.add
                for i in xrange(k):
                    j = _int(random() * n)
                    while j in selected:
                        j = _int(random() * n)
                    selected_add(j)
                    result[i] = population[j]
            except (TypeError, KeyError):   # handle (at least) sets
                # Indexing population[j] failed.  For a list the error is
                # re-raised; any other population is copied into a tuple and
                # sampled again from scratch.
                if isinstance(population, list):
                    raise
                return self.sample(tuple(population), k)
        return result