Revised MD17 dataset


JSON Export

{
  "revision": 7, 
  "id": "466", 
  "created": "2020-07-18T18:58:23.043827+00:00", 
  "metadata": {
    "doi": "10.24435/materialscloud:wy-kn", 
    "status": "published", 
    "title": "Revised MD17 dataset", 
    "mcid": "2020.82", 
    "license_addendum": null, 
    "_files": [
      {
        "description": "Tarfile containing the data in NPZ and CSV format", 
        "key": "rmd17.tar.bz2", 
        "size": 1066301513, 
        "checksum": "md5:cb1a927628d96f2e966025da4fb63d18"
      }, 
      {
        "description": "Readme file", 
        "key": "readme.txt", 
        "size": 2082, 
        "checksum": "md5:29e6f250bb2d1c461363e24955b5be1e"
      }
    ], 
    "owner": 160, 
    "_oai": {
      "id": "oai:materialscloud.org:466"
    }, 
    "keywords": [
      "Chemistry", 
      "Machine Learning", 
      "Noise", 
      "Forces", 
      "Energies", 
      "Molecules"
    ], 
    "conceptrecid": "465", 
    "is_last": true, 
    "references": [
      {
        "type": "Journal reference", 
        "doi": "10.1088/2632-2153/abba6f", 
        "url": "https://iopscience.iop.org/article/10.1088/2632-2153/abba6f", 
        "comment": "Paper in which data is presented", 
        "citation": "A. S. Christensen, O. A. von Lilienfeld"
      }, 
      {
        "type": "Preprint", 
        "doi": "arXiv:2007.09593", 
        "url": "https://arxiv.org/abs/2007.09593", 
        "comment": "Preprint in which data is presented", 
        "citation": "Anders S. Christensen, O. Anatole von Lilienfeld, arXiv:2007.09593  (2020)"
      }
    ], 
    "publication_date": "Jul 23, 2020, 18:50:23", 
    "license": "Creative Commons Zero v1.0 Universal", 
    "id": "466", 
    "description": "The original MD17 dataset (http://quantum-machine.org/datasets/#md-datasets) [Chemiela et al. Sci. Adv. 3(5), e1603015, 2017] contains numerical noise. Thus, any numbers presented from benchmarks on this data are likely flawed. Here, we present a new dataset with negligible numerical noise for benchmarking of forces and energy predictions for molecular dynamics simulations. As the structures are taken from a molecular dynamics simulation (i.e. time series data), they are not guaranteed to be independent samples. This is easily evident from the autocorrelation function for the original MD17 dataset. In short: DO NOT train a model on more than 1000 samples from the revised dataset, and do not train models for more than 50 samples from the original MD17 dataset. Data already published with 50K samples on the original MD17 dataset should be considered meaningless due to this fact and due to the noise in the original data.", 
    "version": 1, 
    "contributors": [
      {
        "email": "anders.christensen@unibas.ch", 
        "affiliations": [
          "Department of Chemistry, University of Basel, Switzerland"
        ], 
        "familyname": "Christensen", 
        "givennames": "Anders"
      }, 
      {
        "email": "anatole.vonlilienfeld@unibas.ch", 
        "affiliations": [
          "Department of Chemistry, University of Basel, Switzerland"
        ], 
        "familyname": "von Lilienfeld", 
        "givennames": "O. Anatole"
      }
    ], 
    "edited_by": 160
  }, 
  "updated": "2021-01-08T09:22:49.214302+00:00"
}