協作閣

開源協作部落格

Patent Application

CG

Freya / 2019-04-12 /


# One-time setup, if the packages are not installed yet:
#   install.packages('readxl')
#   install.packages('tidyverse')
library(readxl)
library(tidyverse)

# Read sheet 5 of the patent-control workbook.
rawdata <- readxl::read_excel("MGR_patent control.xlsx", sheet = 5)

# Derive one application number and one date string per row from `Journal`,
# then flag every repeat of an application number that was already seen.
applications <- rawdata %>%
  # Copy the part of `Journal` before the first ";" into `data`;
  # `Journal` itself is kept.
  separate(Journal, into = "data", sep = ";", extra = "drop", remove = FALSE) %>%
  # Split `data` at ":" into the leading label and the remainder.
  separate(data, into = c("front", "number"), sep = ":") %>%
  # Split the remainder on single spaces into five positional fields.
  separate(
    number,
    into = c("extra", "code", "id", "junk", "date"),
    sep = " "
  ) %>%
  # Only `code`, `id` and `date` are used from here on.
  select(-extra, -junk) %>%
  mutate(id = str_trim(id, side = "both")) %>%
  # Glue `code` and `id` together with no separator.
  unite(application_number, c("code", "id"), sep = "") %>%
  mutate(
    application_number = str_trim(application_number, side = "both"),
    # TRUE for every occurrence after the first of a given application number.
    duplicated = duplicated(application_number)
  )

applications %>% head()
#> # A tibble: 6 x 7
#>   Organism    Journal         front   Company  application_num… date  duplicated
#>   <chr>       <chr>           <chr>   <chr>    <chr>            <chr> <lgl>     
#> 1 methanocal… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… FALSE     
#> 2 pyrococcus… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… TRUE      
#> 3 pyrococcus… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… TRUE      
#> 4 pyrococcus… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… TRUE      
#> 5 pyrococcus… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… TRUE      
#> 6 pyrococcus… "JOURNAL   Pat… JOURNA… Abbott … WO2011053699-A1  05-M… TRUE
# Tally rows flagged as repeats (TRUE) versus first occurrences (FALSE).
count(applications, duplicated)
#> # A tibble: 2 x 2
#>   duplicated     n
#>   <lgl>      <int>
#> 1 FALSE        999
#> 2 TRUE       11999
# Keep only the first occurrence of each application number, parse the
# day-month-year date string, and derive the calendar year from it.
# NOTE(review): the variable name `unique` is the same as base::unique();
# it is kept unchanged in case later code refers to it.
unique <- applications %>%
  filter(!duplicated) %>%
  mutate(
    date = lubridate::dmy(date),
    year = lubridate::year(date)
  )

# Number of distinct applications per year (count column is `n`).
unique_year <- count(unique, year)

# Mean and median of the per-year application counts.
mean(unique_year[["n"]])
#> [1] 41.625
median(unique_year[["n"]])
#> [1] 47.5
# Render the per-year counts as a table with centred columns.
knitr::kable(unique_year, align = "c")
year n
1988 1
1991 2
1992 1
1993 2
1998 3
1999 12
2000 15
2001 66
2002 67
2003 42
2004 56
2005 66
2006 67
2007 72
2008 68
2009 90
2010 89
2011 53
2012 54
2013 30
2014 37
2015 49
2016 46
2017 11
# Point-and-line chart of applications per year; the y axis is fixed to 0-100
# and the x axis asks for roughly 20 "pretty" breaks.
ggplot(unique_year, aes(x = year, y = n)) +
  geom_point() +
  geom_line() +
  ylim(0, 100) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20)) +
  labs(
    title = "Trends in patent applications for Genetic Sequences",
    x = "application year",
    y = "applications"
  )