years - Daria-Maltseva/mixedmethods GitHub Wiki

Final Dataset

December 2019

Hits

setwd("C:/Mail.Ru Cloud/ANR HSE/ANR Projects/Mixed methods/Data/2019_FINAL DATASET_copy")

# Hits: number of works per publication year, read from the DC1.clu partition.
# The first two lines of the file are skipped; the remaining values are
# tabulated, so names(t) are the distinct values and t holds their counts.
number <- read.table(
  file = "DC1.clu",
  sep = ",",
  header = FALSE,
  skip = 2
)[["V1"]]
t <- table(number)
t

years <- as.integer(names(t))
length(years)
years

# Positions 11..34 of the tabulated values are the window used for the fit.
idx <- 11:34
year <- years[idx]
year
length(year)

freq <- t[idx]
length(freq)
freq

# Custom y-axis ticks: the default axis is suppressed with yaxt = "n" and
# redrawn at the positions listed in yt.
yt <- c(0, 100, 500, 1000, 2000, 3000, 5000)
plot(
  year, freq,
  cex = 1,
  main = "Hits per year",
  yaxt = "n",
  xlab = "Years",
  ylab = "Freq",
  pch = 16
)
axis(side = 2, at = yt, labels = yt)

# Exponential growth model freq = c * a^(year - 1992).
# Vlado proposed these starting values for c and a.
model <- nls(
  freq ~ c * a^(year - 1992),
  start = list(c = 10000, a = 2)
)
model

Nonlinear regression model
  model: freq ~ c * a^(year - 1992)
   data: parent.frame()
    c     a 
3.762 1.326 
 residual sum-of-squares: 51275

Number of iterations to convergence: 34 
Achieved convergence tolerance: 1.057e-06

# Redraw the yearly hit counts and overlay the fitted exponential curve.
plot(year, freq, cex = 1, main = "Hits per year", yaxt = "n",
     xlab = "Years", ylab = "Freq", pch = 16)
# yaxt = "n" suppresses the default y axis, so the custom ticks in yt have to
# be drawn again on the new plot.
axis(side = 2, at = yt, labels = yt)
# The model formula is written in terms of `year`, so newdata must supply a
# variable with that name; a list element called `x` is ignored by predict().
# `lwd` is spelled out instead of relying on partial matching of `lw`.
lines(year, predict(model, list(year = year)), col = "red", lwd = 2)

Then compute the number of years x in which the field doubles: a^x = 2;
x log a = log 2;
x = log 2 / log a

# Doubling time in years, using the rounded estimate a = 1.326 from the fit.
log(2) / log(1.326)
# OR: take the estimate directly from the fitted model object.
summary(model)
coef(model)
a <- coef(model)
a[1]
a[2] # we need this: the second coefficient is the growth factor a
log(2) / log(a[2])

Cited only

# Cited only: number of works per publication year, read from the DC0.clu
# partition. The first two lines of the file are skipped and the remaining
# values are tabulated.

Y <- read.table(file = "DC0.clu", sep = ",", header = FALSE, skip = 2)$V1
t <- table(Y)
years <- as.integer(names(t))
length(years)
years

# Positions 132..252 of the tabulated values are the window used for the fit.
year0 <- years[132:252]
year0
freq0 <- t[132:252]

# Optional: add log = "y" to the plot() call for a logarithmic y axis.
length(freq0)
# Custom y-axis ticks; the default axis is suppressed with yaxt = "n".
yt <- c(0, 1000, 5000, 10000, 15000, 20000, 30000)
plot(year0, freq0, cex = 1, main = "Cited only works per year", yaxt = "n",
     xlab = "Years", ylab = "Freq", pch = 16)
axis(side = 2, at = yt, labels = yt)

# Scaled log-normal density in the age of the work (2019 - year0).
model <- nls(freq0 ~ c * dlnorm(2019 - year0, a, b),
             start = list(c = 1e6, a = 2, b = 0.7))
model
# Peak of the series plotted here. This used to be max(freq), which looks at
# the hits vector from the other section and fails when this section is run
# on its own.
max(freq0)
plot(year0, freq0, cex = 0.75, main = "Cited only works per year", yaxt = "n",
     xlab = "Years", ylab = "Freq")
axis(side = 2, at = yt, labels = yt)
# The formula refers to `year0`, so newdata must supply that name; an element
# called `x` is ignored by predict(). `lwd` is spelled out in full.
lines(year0, predict(model, list(year0 = year0)), col = "red", lwd = 2)

model

Nonlinear regression model
  model: freq0 ~ c * dlnorm(2019 - year0, a, b)
   data: parent.frame()
        c         a         b 
5.311e+05 2.480e+00 6.351e-01 
 residual sum-of-squares: 36815713

Number of iterations to convergence: 8 
Achieved convergence tolerance: 1.59e-06

> coef(model)
           c            a            b 
5.311115e+05 2.479916e+00 6.350575e-01 
> 

Preparation

DC.clu

==============================================================================
3. C:\Mail.Ru Cloud\ANR HSE\ANR Projects\Mixed methods\Data\2019_FINAL DATASET_copy\DC.clu (547407)
==============================================================================
Dimension: 547407
The lowest value:  0
The highest value: 5

Frequency distribution of cluster values:

   Cluster      Freq     Freq%   CumFreq  CumFreq% Representative
 ----------------------------------------------------------------
         0    529238   96.6809    529238   96.6809 ABBOTT_L(2015)26:340
         1     14636    2.6737    543874   99.3546 HILL_M(2018)34:71
         2        11    0.0020    543885   99.3566 WHITTEMO_R(2005)52:546
         4      1873    0.3422    545758   99.6988 COLLINS_K(2006)4:67
         5      1649    0.3012    547407  100.0000 ANDERSON_V(2017)28:125
 ----------------------------------------------------------------
       Sum    547407  100.0000

Transformed: binarized

==============================================================================
6. Binarized C3 [1-*] (547407)
==============================================================================
Dimension: 547407
The lowest value:  0
The highest value: 1

Frequency distribution of cluster values:

   Cluster      Freq     Freq%   CumFreq  CumFreq% Representative
 ----------------------------------------------------------------
         0    529238   96.6809    529238   96.6809 ABBOTT_L(2015)26:340
         1     18169    3.3191    547407  100.0000 HILL_M(2018)34:71
 ----------------------------------------------------------------
       Sum    547407  100.0000

Saved as DCbin.clu
Based on this binarized partition, two subpartitions of publication years were constructed: DC1.clu (hits) and DC0.clu (cited only).

Distributions in R

Hits

> setwd("C:/Mail.Ru Cloud/ANR HSE/ANR Projects/Mixed methods/Data/2019_FINAL DATASET_copy")
> number<-read.table(file="DC1.clu", sep=",", header=FALSE, skip=2)$V1
> t<-table(number)
> head(t)
number
   0 1959 1967 1978 1979 1983 
  15    1    1    1    1    1 
> years <- as.integer(names(t))
> length(years)
[1] 37
> year <- years[2:37]
> year
 [1] 1959 1967 1978 1979 1983 1985 1988 1989 1991 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
[21] 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
> length(year)
[1] 36
> freq <- t[2:37]
> length(freq)
[1] 36
> freq
number
1959 1967 1978 1979 1983 1985 1988 1989 1991 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 
   1    1    1    1    1    1    2    4    1    5    4    4    6   13    8   12   15   13   27   41 
2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 
  54   79  133  209  354  465  684  822 1045 1429 1827 2609 3214 3600 1142  326 
> yt <- c(0,100,500,1000,2000,3000)
> plot(year,freq,cex=1,main="Hits per year", yaxt="n", xlab = "Years", ylab = "Freq", pch=16)
> axis(side=2,at=yt, labels=yt)

Cited only

> Y<-read.table(file="DC0.clu", sep=",", header=FALSE, skip=2)$V1
> t <- table(Y)
> years <- as.integer(names(t))
> length(years)
[1] 252
> years
  [1]    0    1    2 1003 1093 1142 1513 1575 1595 1603 1610 1651 1669 1676 1687 1707 1710 1724 1736
 [20] 1739 1748 1750 1751 1756 1757 1758 1762 1775 1776 1781 1785 1786 1787 1788 1789 1790 1791 1794
 [39] 1795 1796 1797 1802 1803 1805 1807 1808 1809 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820
 [58] 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1833 1834 1835 1836 1837 1838 1839 1840
 [77] 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1857 1858 1859 1860
 [96] 1861 1862 1863 1864 1865 1867 1868 1869 1870 1871 1872 1875 1876 1877 1878 1879 1880 1881 1882
[115] 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901
[134] 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920
[153] 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939
[172] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
[191] 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
[210] 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
[229] 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
[248] 2016 2017 2018 2019 2020
> year0 <- years[7:243]
> year0
  [1] 1513 1575 1595 1603 1610 1651 1669 1676 1687 1707 1710 1724 1736 1739 1748 1750 1751 1756 1757
 [20] 1758 1762 1775 1776 1781 1785 1786 1787 1788 1789 1790 1791 1794 1795 1796 1797 1802 1803 1805
 [39] 1807 1808 1809 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826
 [58] 1827 1828 1829 1830 1831 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846
 [77] 1847 1848 1849 1850 1851 1852 1853 1854 1855 1857 1858 1859 1860 1861 1862 1863 1864 1865 1867
 [96] 1868 1869 1870 1871 1872 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888
[115] 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907
[134] 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926
[153] 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945
[172] 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964
[191] 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983
[210] 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
[229] 2003 2004 2005 2006 2007 2008 2009 2010 2011
> freq0 <- t[7:243]
> length(freq0)
[1] 237
> yt <- c(0,1000,5000,10000,15000, 20000, 30000); 
> plot(year0,freq0,cex=1,main="Cited only works per year", yaxt="n", xlab = "Years", ylab = "Freq", pch=16)
> axis(side=2,at=yt, labels=yt)
> length(freq0)
[1] 237
> yt <- c(0,1000,5000,10000,15000, 20000, 30000); 
> plot(year0,freq0,cex=1,main="Cited only works per year", yaxt="n", xlab = "Years", ylab = "Freq", pch=16)
> axis(side=1,at=yt, labels=yt)
> length(freq0)
[1] 237
> yt <- c(0,1000,5000,10000,15000, 20000, 30000); 
> plot(year0,freq0,cex=1,main="Cited only works per year", yaxt="n", xlab = "Years", ylab = "Freq", pch=16)
> axis(side=2,at=yt, labels=yt)
> model <- nls(freq0~c*dlnorm(2019-year0,a,b),start=list(c=1e6,a=2,b=0.7))
> model
Nonlinear regression model
  model: freq0 ~ c * dlnorm(2019 - year0, a, b)
   data: parent.frame()
        c         a         b 
4.830e+05 2.517e+00 5.533e-01 
 residual sum-of-squares: 7720160

Number of iterations to convergence: 7 
Achieved convergence tolerance: 1.711e-06
> freq0 
Y
 1513  1575  1595  1603  1610  1651  1669  1676  1687  1707  1710  1724  1736  1739  1748  1750  1751 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1 
 1756  1757  1758  1762  1775  1776  1781  1785  1786  1787  1788  1789  1790  1791  1794  1795  1796 
    1     1     1     1     1     3     1     1     1     1     1     1     2     1     3     2     1 
 1797  1802  1803  1805  1807  1808  1809  1811  1812  1813  1814  1815  1816  1817  1818  1819  1820 
    1     1     1     1     2     1     2     3     1     2     1     3     1     2     4     1     1 
 1821  1822  1823  1824  1825  1826  1827  1828  1829  1830  1831  1833  1834  1835  1836  1837  1838 
    1     2     1     4     2     2     5     2     6     4     4     7     4     5     3     4     5 
 1839  1840  1841  1842  1843  1844  1845  1846  1847  1848  1849  1850  1851  1852  1853  1854  1855 
    4     3     1     2     4     4     2     1     1     2     2     2     1     1     1     1     2 
 1857  1858  1859  1860  1861  1862  1863  1864  1865  1867  1868  1869  1870  1871  1872  1875  1876 
    4     6     3     3     2     1     2     2     2     2     2     2     3     5     3     1     4 
 1877  1878  1879  1880  1881  1882  1883  1884  1885  1886  1887  1888  1889  1890  1891  1892  1893 
    5     4     2     2     1     3     3     2     6     3     2     2     8     9     3    11     6 
 1894  1895  1896  1897  1898  1899  1900  1901  1902  1903  1904  1905  1906  1907  1908  1909  1910 
    3     5     8     8     9     9     7    10    13    11    17    14    10     9    18    10    23 
 1911  1912  1913  1914  1915  1916  1917  1918  1919  1920  1921  1922  1923  1924  1925  1926  1927 
   11    13    18    16    16    18     9     5    11    18    16    19     8    16    25    21    25 
 1928  1929  1930  1931  1932  1933  1934  1935  1936  1937  1938  1939  1940  1941  1942  1943  1944 
   24    25    35    26    37    31    32    38    33    40    37    37    18    29    27    29    22 
 1945  1946  1947  1948  1949  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
   32    51    60    75    84    77    67    66    89   108    96   115   135   144   145   166   166 
 1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973  1974  1975  1976  1977  1978 
  189   198   258   239   278   332   351   388   442   440   496   563   575   666   710   848   959 
 1979  1980  1981  1982  1983  1984  1985  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995 
 1019  1124  1196  1374  1575  1664  1934  2119  2421  2585  2958  3757  4168  4552  5035  5932  6857 
 1996  1997  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009  2010  2011 
 7747  8423 10106 11349 14837 15865 18059 20120 22195 24837 26844 28466 30177 31272 32849 32714 

Removing the early years (keeping 1894 onwards, i.e. positions 126:252) and adding the years after 2011:

> t <- table(Y)
> years <- as.integer(names(t))
> length(years)
[1] 252
> years
  [1]    0    1    2 1003 1093 1142 1513 1575 1595 1603 1610 1651 1669 1676 1687 1707 1710 1724 1736
 [20] 1739 1748 1750 1751 1756 1757 1758 1762 1775 1776 1781 1785 1786 1787 1788 1789 1790 1791 1794
 [39] 1795 1796 1797 1802 1803 1805 1807 1808 1809 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820
 [58] 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1833 1834 1835 1836 1837 1838 1839 1840
 [77] 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1857 1858 1859 1860
 [96] 1861 1862 1863 1864 1865 1867 1868 1869 1870 1871 1872 1875 1876 1877 1878 1879 1880 1881 1882
[115] 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901
[134] 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920
[153] 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939
[172] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
[191] 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
[210] 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
[229] 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
[248] 2016 2017 2018 2019 2020
> year0 <- years[126:252]
> year0
  [1] 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912
 [20] 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931
 [39] 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950
 [58] 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969
 [77] 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
 [96] 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
[115] 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
> freq0 <- t[126:252]
> 
> length(freq0)
[1] 127
> yt <- c(0,1000,5000,10000,15000, 20000, 30000); 
> plot(year0,freq0,cex=1,main="Cited only works per year", yaxt="n", xlab = "Years", ylab = "Freq", pch=16)
> axis(side=2,at=yt, labels=yt)
> 
> model <- nls(freq0~c*dlnorm(2019-year0,a,b),start=list(c=1e6,a=2,b=0.7))
> model
Nonlinear regression model
  model: freq0 ~ c * dlnorm(2019 - year0, a, b)
   data: parent.frame()
        c         a         b 
5.311e+05 2.480e+00 6.351e-01 
 residual sum-of-squares: 36815818

Number of iterations to convergence: 8 
Achieved convergence tolerance: 1.591e-06
> max(freq)
[1] 3600
> plot(year0,freq0,cex=0.75,main="Cited only works per year", yaxt="n", xlab = "Years", ylab = "Freq")
> axis(side=2,at=yt, labels=yt)
> lines(year0,predict(model,list(x=2019-year0)),col="red",lw=2)
> 

freq

 1513  1575  1595  1603  1610  1651  1669  1676  1687  1707  1710  1724  1736  1739  1748  1750  1751 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1 
 1756  1757  1758  1762  1775  1776  1781  1785  1786  1787  1788  1789  1790  1791  1794  1795  1796 
    1     1     1     1     1     3     1     1     1     1     1     1     2     1     3     2     1 
 1797  1802  1803  1805  1807  1808  1809  1811  1812  1813  1814  1815  1816  1817  1818  1819  1820 
    1     1     1     1     2     1     2     3     1     2     1     3     1     2     4     1     1 
 1821  1822  1823  1824  1825  1826  1827  1828  1829  1830  1831  1833  1834  1835  1836  1837  1838 
    1     2     1     4     2     2     5     2     6     4     4     7     4     5     3     4     5 
 1839  1840  1841  1842  1843  1844  1845  1846  1847  1848  1849  1850  1851  1852  1853  1854  1855 
    4     3     1     2     4     4     2     1     1     2     2     2     1     1     1     1     2 
 1857  1858  1859  1860  1861  1862  1863  1864  1865  1867  1868  1869  1870  1871  1872  1875  1876 
    4     6     3     3     2     1     2     2     2     2     2     2     3     5     3     1     4 
 1877  1878  1879  1880  1881  1882  1883  1884  1885  1886  1887  1888  1889  1890  1891  1892  1893 
    5     4     2     2     1     3     3     2     6     3     2     2     8     9     3    11     6 
 1894  1895  1896  1897  1898  1899  1900  1901  1902  1903  1904  1905  1906  1907  1908  1909  1910 
    3     5     8     8     9     9     7    10    13    11    17    14    10     9    18    10    23 
 1911  1912  1913  1914  1915  1916  1917  1918  1919  1920  1921  1922  1923  1924  1925  1926  1927 
   11    13    18    16    16    18     9     5    11    18    16    19     8    16    25    21    25 
 1928  1929  1930  1931  1932  1933  1934  1935  1936  1937  1938  1939  1940  1941  1942  1943  1944 
   24    25    35    26    37    31    32    38    33    40    37    37    18    29    27    29    22 
 1945  1946  1947  1948  1949  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
   32    51    60    75    84    77    67    66    89   108    96   115   135   144   145   166   166 
 1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973  1974  1975  1976  1977  1978 
  189   198   258   239   278   332   351   388   442   440   496   563   575   666   710   848   959 
 1979  1980  1981  1982  1983  1984  1985  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995 
 1019  1124  1196  1374  1575  1664  1934  2119  2421  2585  2958  3757  4168  4552  5035  5932  6857 
 1996  1997  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009  2010  2011  2012 
 7747  8423 10106 11349 14837 15865 18059 20120 22195 24837 26844 28466 30177 31272 32849 32714 32126 
 2013  2014  2015  2016  2017  2018  2019  2020 
30451 25867 20012 12030  4606  1137   130     1 

Old dataset

Hits

setwd("C:/Mail.Ru Cloud/ANR HSE/ANR Projects/Mixed methods/WoS2Pajek Data with EXTRA 3rd/Additional search from July 2019")

# Hits (old dataset): tabulate the values stored in DC1.clu after skipping the
# first two lines of the file.
number <- read.table(
  file = "DC1.clu",
  sep = ",",
  header = FALSE,
  skip = 2
)[["V1"]]
t <- table(number)
head(t)
t

years <- as.integer(names(t))
length(years)
years

# Drop the first tabulated value and keep positions 2..37.
idx <- 2:37
year <- years[idx]
year
length(year)

freq <- t[idx]
length(freq)
freq

# Custom y-axis ticks; the default axis is suppressed with yaxt = "n".
yt <- c(0, 100, 500, 1000, 2000, 3000)
plot(
  year, freq,
  cex = 1,
  main = "Hits per year",
  yaxt = "n",
  xlab = "Years",
  ylab = "Freq",
  pch = 16
)
axis(side = 2, at = yt, labels = yt)

Cited only works

# Cited only works (old dataset): tabulate the values stored in DC0.clu after
# skipping the first two lines of the file.
Y <- read.table(file = "DC0.clu", sep = ",", header = FALSE, skip = 2)$V1
t <- table(Y)
years <- as.integer(names(t))
length(years)
years
# Positions 7..243 of the tabulated values are the window used for the fit.
year0 <- years[7:243]
freq0 <- t[7:243]

year0

length(freq0)
# Custom y-axis ticks; the default axis is suppressed with yaxt = "n".
yt <- c(0, 1000, 5000, 10000, 15000, 20000, 30000)
plot(year0, freq0, cex = 1, main = "Cited only works per year", yaxt = "n",
     xlab = "Years", ylab = "Freq", pch = 16)
axis(side = 2, at = yt, labels = yt)

# Scaled log-normal density in the age of the work (2019 - year0).
model <- nls(freq0 ~ c * dlnorm(2019 - year0, a, b),
             start = list(c = 1e6, a = 2, b = 0.7))
model
# Peak of the series plotted here. This used to be max(freq), which looks at
# the hits vector from the other section and fails when this section is run
# on its own.
max(freq0)
plot(year0, freq0, cex = 0.75, main = "Cited only works per year", yaxt = "n",
     xlab = "Years", ylab = "Freq")
axis(side = 2, at = yt, labels = yt)
# The formula refers to `year0`, so newdata must supply that name; an element
# called `x` is ignored by predict(). `lwd` is spelled out in full.
lines(year0, predict(model, list(year0 = year0)), col = "red", lwd = 2)

Fitting result:

> model <- nls(freq0~c*dlnorm(2019-year0,a,b),start=list(c=1e6,a=2,b=0.7))
> model
Nonlinear regression model
  model: freq0 ~ c * dlnorm(2019 - year0, a, b)
   data: parent.frame()
        c         a         b 
4.840e+05 2.482e+00 6.227e-01 
 residual sum-of-squares: 24739746

Number of iterations to convergence: 8 
Achieved convergence tolerance: 1.261e-06