NESCent · hlapp · May 2, 2014 · May 2, 2014 · Nov 17, 2016 · hlapp
diff --git a/scripts/all_datasets.py b/scripts/all_datasets.py
@@ -6,7 +6,13 @@
 >>> len(data_files)
 11
 '''
+'''
+# ls note: deposit date ideally would have been 2005 for all datasets; when not indicated by repository, 
+date of publication used (as per http://researchremix.wordpress.com/2011/02/16/choosing-repositories-for-the-tracking-data-reuse-project/:
+"In some repositories it is very difficult to determine the date of deposit. 
+I use date of article publication as an imperfect proxy.") cf. Jonathan's notes (https://notebooks.dataone.org/data-reuse/links-to-our-data/) 
 
+''' 
 import csv
 import os
 import fnmatch
@@ -15,14 +21,18 @@
 
 year_regex = re.compile('[(\[ ]([1-2][0-9]{3})[)\].]')
 
-data_files = fnmatch.filter(os.listdir('data/repo_datasets/'), 
- '*_datasets.csv')
+# ls data_files = fnmatch.filter(os.listdir('data/repo_datasets/'), 
+# ls '*_datasets.csv')
+
+data_files = fnmatch.filter(os.listdir('data/cleaner_old_all_datasets/'), 
+ '*.tsv')
 
 if __name__ == '__main__':
  print 'repo\tid\twos\tgs\tyear'
 
  for data_file in data_files:
- path = os.path.join('data/repo_datasets', data_file)
+# ls path = os.path.join('data/repo_datasets', data_file)
+ path = os.path.join('data/cleaner_old_all_datasets', data_file)
  repo = clean_repo_name(data_file[:-len('_datasets.csv')])
  with open(path) as input_file:
  r = csv.reader(input_file)
@@ -43,9 +53,10 @@
  gs_col = gs_cols[0]
  except IndexError:
  gs_col = None
-
  date_cols = [n for n, x in enumerate(header)
- if 'date' in x.lower() or 'year' in x.lower()]
+# ls if 'date' in x.lower() or 'year' in x.lower()]
+ if 'date made public' in x.lower()]
+
  try:
  assert len(date_cols) == 1
  date_col = date_cols[0]
@@ -57,6 +68,8 @@
  # if there's not a date column, try to parse it out of each 
  # line with regular expressions
  date_col = None
+
+
 
  for line in r:
  if len(line) <= 1: continue
@@ -69,6 +82,8 @@
  vals.append(wos)
 
  if gs_col is None: gs = 0
+# ls: a literal 'NA' in the gs cell is counted as zero citations
+ elif line[gs_col].strip() == 'NA': gs = 0
  else: 
  gs = line[gs_col].split()[0]
  if not gs.strip(): gs = 0

diff --git a/scripts/get_refs.py b/scripts/get_refs.py
@@ -15,9 +15,11 @@ def main(file_path):
  vals = line.split('\t')
  repo = vals[1]
  accession = vals[2]
- longest = sorted(vals, key=lambda k: len(k), reverse=True)[0].replace('\n', '').replace('\r', '')
- if longest.startswith('http'): continue
- if len(longest) < 100: continue
+# ls longest = sorted(vals, key=lambda k: len(k), reverse=True)[0].replace('\n', '').replace('\r', '')
+# ls if longest.startswith('http'): continue
+# ls if len(longest) < 100: continue
+# article reference is in column 8 in the data table/spreadsheet that combines repos
+ longest = vals[8]
  print '\t'.join((repo, accession, longest))
 
 if __name__ == '__main__':